smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. smftools/__init__.py +9 -4
  2. smftools/_version.py +1 -1
  3. smftools/cli.py +184 -0
  4. smftools/config/__init__.py +1 -0
  5. smftools/config/conversion.yaml +33 -0
  6. smftools/config/deaminase.yaml +56 -0
  7. smftools/config/default.yaml +253 -0
  8. smftools/config/direct.yaml +17 -0
  9. smftools/config/experiment_config.py +1191 -0
  10. smftools/hmm/HMM.py +1576 -0
  11. smftools/hmm/__init__.py +20 -0
  12. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  13. smftools/hmm/call_hmm_peaks.py +106 -0
  14. smftools/{tools → hmm}/display_hmm.py +3 -3
  15. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  16. smftools/{tools → hmm}/train_hmm.py +1 -1
  17. smftools/informatics/__init__.py +0 -2
  18. smftools/informatics/archived/deaminase_smf.py +132 -0
  19. smftools/informatics/fast5_to_pod5.py +4 -1
  20. smftools/informatics/helpers/__init__.py +3 -4
  21. smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
  22. smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
  23. smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
  24. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
  25. smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
  26. smftools/informatics/helpers/discover_input_files.py +100 -0
  27. smftools/informatics/helpers/extract_base_identities.py +29 -3
  28. smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
  29. smftools/informatics/helpers/find_conversion_sites.py +5 -4
  30. smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
  31. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  32. smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
  33. smftools/informatics/helpers/split_and_index_BAM.py +1 -5
  34. smftools/load_adata.py +1346 -0
  35. smftools/machine_learning/__init__.py +12 -0
  36. smftools/machine_learning/data/__init__.py +2 -0
  37. smftools/machine_learning/data/anndata_data_module.py +234 -0
  38. smftools/machine_learning/evaluation/__init__.py +2 -0
  39. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  40. smftools/machine_learning/evaluation/evaluators.py +223 -0
  41. smftools/machine_learning/inference/__init__.py +3 -0
  42. smftools/machine_learning/inference/inference_utils.py +27 -0
  43. smftools/machine_learning/inference/lightning_inference.py +68 -0
  44. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  45. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  46. smftools/machine_learning/models/base.py +295 -0
  47. smftools/machine_learning/models/cnn.py +138 -0
  48. smftools/machine_learning/models/lightning_base.py +345 -0
  49. smftools/machine_learning/models/mlp.py +26 -0
  50. smftools/{tools → machine_learning}/models/positional.py +3 -2
  51. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  52. smftools/machine_learning/models/sklearn_models.py +273 -0
  53. smftools/machine_learning/models/transformer.py +303 -0
  54. smftools/machine_learning/training/__init__.py +2 -0
  55. smftools/machine_learning/training/train_lightning_model.py +135 -0
  56. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  57. smftools/plotting/__init__.py +4 -1
  58. smftools/plotting/autocorrelation_plotting.py +611 -0
  59. smftools/plotting/general_plotting.py +566 -89
  60. smftools/plotting/hmm_plotting.py +260 -0
  61. smftools/plotting/qc_plotting.py +270 -0
  62. smftools/preprocessing/__init__.py +13 -8
  63. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  64. smftools/preprocessing/append_base_context.py +122 -0
  65. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  66. smftools/preprocessing/calculate_complexity_II.py +248 -0
  67. smftools/preprocessing/calculate_coverage.py +10 -1
  68. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  69. smftools/preprocessing/clean_NaN.py +17 -1
  70. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  71. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  72. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  73. smftools/preprocessing/invert_adata.py +12 -5
  74. smftools/preprocessing/load_sample_sheet.py +19 -4
  75. smftools/readwrite.py +849 -43
  76. smftools/tools/__init__.py +3 -32
  77. smftools/tools/calculate_umap.py +5 -5
  78. smftools/tools/general_tools.py +3 -3
  79. smftools/tools/position_stats.py +468 -106
  80. smftools/tools/read_stats.py +115 -1
  81. smftools/tools/spatial_autocorrelation.py +562 -0
  82. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
  83. smftools-0.2.1.dist-info/RECORD +161 -0
  84. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  85. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  86. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  87. smftools/informatics/load_adata.py +0 -182
  88. smftools/preprocessing/append_C_context.py +0 -82
  89. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  90. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  91. smftools/preprocessing/filter_reads_on_length.py +0 -51
  92. smftools/tools/call_hmm_peaks.py +0 -105
  93. smftools/tools/data/__init__.py +0 -2
  94. smftools/tools/data/anndata_data_module.py +0 -90
  95. smftools/tools/evaluation/__init__.py +0 -0
  96. smftools/tools/inference/__init__.py +0 -1
  97. smftools/tools/inference/lightning_inference.py +0 -41
  98. smftools/tools/models/base.py +0 -14
  99. smftools/tools/models/cnn.py +0 -34
  100. smftools/tools/models/lightning_base.py +0 -41
  101. smftools/tools/models/mlp.py +0 -17
  102. smftools/tools/models/sklearn_models.py +0 -40
  103. smftools/tools/models/transformer.py +0 -133
  104. smftools/tools/training/__init__.py +0 -1
  105. smftools/tools/training/train_lightning_model.py +0 -47
  106. smftools-0.1.7.dist-info/RECORD +0 -136
  107. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  108. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  109. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  110. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  111. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  112. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  113. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  114. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  115. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  116. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  117. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  118. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  119. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  120. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
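Beyond the new modules, the renames above show `smftools/tools` being split apart: the HMM code moves to `smftools/hmm`, and the model, training, and inference code moves to `smftools/machine_learning`. A minimal sketch of how import paths shift for downstream code, using only the module paths visible in the renames (whether the new `__init__.py` files re-export these names is not shown in this diff):

# 0.1.7 import paths (left-hand side of the renames above):
#   from smftools.tools import train_hmm
#   from smftools.tools.models import rnn
# 0.2.1 import paths (right-hand side of the renames above); importing the
# submodules directly avoids assuming what the new __init__.py files re-export.
from smftools.hmm import train_hmm
from smftools.machine_learning.models import rnn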
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: smftools
- Version: 0.1.7
+ Version: 0.2.1
  Summary: Single Molecule Footprinting Analysis in Python.
  Project-URL: Source, https://github.com/jkmckenna/smftools
  Project-URL: Documentation, https://smftools.readthedocs.io/
@@ -46,6 +46,8 @@ Classifier: Topic :: Scientific/Engineering :: Visualization
  Requires-Python: >=3.9
  Requires-Dist: anndata>=0.10.0
  Requires-Dist: biopython>=1.79
+ Requires-Dist: captum
+ Requires-Dist: click
  Requires-Dist: fastcluster
  Requires-Dist: hydra-core
  Requires-Dist: igraph
@@ -64,8 +66,10 @@ Requires-Dist: scanpy>=1.9
  Requires-Dist: scikit-learn>=1.0.2
  Requires-Dist: scipy>=1.7.3
  Requires-Dist: seaborn>=0.11
+ Requires-Dist: shap
  Requires-Dist: torch>=1.9.0
  Requires-Dist: tqdm
+ Requires-Dist: upsetplot
  Requires-Dist: wandb
  Provides-Extra: docs
  Requires-Dist: ipython>=7.20; extra == 'docs'
@@ -0,0 +1,161 @@
1
+ smftools/__init__.py,sha256=OXW2_b5NUGZhTXsH8qY0PzfJnaz8T2y6lCqMnSVSuIk,676
2
+ smftools/_settings.py,sha256=Ed8lzKUA5ncq5ZRfSp0t6_rphEEjMxts6guttwTZP5Y,409
3
+ smftools/_version.py,sha256=tC9CwL4Nm8brVXJnZNGk_eoZaJj6eOtLKtOrdJMrpoI,21
4
+ smftools/cli.py,sha256=MNObu279y322JHkmugssM0rVHo0UQ1zboTG9MlqnMgQ,7033
5
+ smftools/load_adata.py,sha256=VJMUBqRC8InIj48JMnkZKLuqEz1u8uSTNx_ARl0cn7M,74313
6
+ smftools/readwrite.py,sha256=ObNxBj6Y_zIHqQpAvmHAddAypLjg7F3qARF-sH-V3do,42706
7
+ smftools/config/__init__.py,sha256=ObUnnR7aRSoD_uvpmsxA_BUFt4NOOfWNopDVCqjp7tg,69
8
+ smftools/config/conversion.yaml,sha256=rJGhrVd95p6_6OVxLq2lvobJu8SGzNYI80jU0fLeK_g,795
9
+ smftools/config/deaminase.yaml,sha256=Vh3Wg0bCb88S20Ob-8zi3eQJ1g_pcBulR9pPbAX9U1o,1138
10
+ smftools/config/default.yaml,sha256=0DYIvvdbzoB2eJgsoxEzx4Rc0TVGaiHa85nxo1VwCqQ,9704
11
+ smftools/config/direct.yaml,sha256=2F_fGploWW3f88Y7sTZ68Vk9fgNaO-sb5AK-Cutc2TQ,735
12
+ smftools/config/experiment_config.py,sha256=zQhWaag9hPuexnTOqZ-Od--c3iHs18c4Wc2sU-LOyts,52872
13
+ smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz,sha256=q6wJtgFRDln0o20XNCx1qad3lwcdCoylqPN7wskTfI8,2926497
14
+ smftools/datasets/F1_sample_sheet.csv,sha256=9PodIIOXK2eamYPbC6DGnXdzgi9bRDovf296j1aM0ak,259
15
+ smftools/datasets/__init__.py,sha256=xkSTlPuakVYVCuRurif9BceNBDt6bsngJvvjI8757QI,142
16
+ smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz,sha256=niOcVHaYY7h3XyvwSkN-V_NMBaRt2vTP5TrJO0CwMCs,8385050
17
+ smftools/datasets/datasets.py,sha256=0y597Ntp707bOgDwN6O-JEt9yxgplj66p0aj6Zs_IB4,779
18
+ smftools/hmm/HMM.py,sha256=K8rt-EHn3ylIHpQ3dHf_OZCXxCBVSS2UWTgSGOatwHw,71046
19
+ smftools/hmm/__init__.py,sha256=BkX145eGVy-kFOtyqOcu-Hzv9ZJLDQ3cfDe51eKBTwY,585
20
+ smftools/hmm/apply_hmm_batched.py,sha256=BBeJ8DiIuuMWzLwtDdk2DO2vvrfLCrVe4JtRYPFItIU,10648
21
+ smftools/hmm/calculate_distances.py,sha256=KDWimQ6u-coyxCKrbTm42Fh_Alf_gURBZ0vfFaem848,644
22
+ smftools/hmm/call_hmm_peaks.py,sha256=T-3Ld8H4t3Mgg2whBTYP9s2QL7rY-9RIzVCgB6avKhE,4625
23
+ smftools/hmm/display_hmm.py,sha256=3WuQCPvM3wPfzAdgbhfiBTd0g5mQdx9HTUdqAxs2aj4,825
24
+ smftools/hmm/hmm_readwrite.py,sha256=DjJ3hunpBQ7N0GVvxL7-0QUas_SkA88LVgL72mVK2cI,359
25
+ smftools/hmm/nucleosome_hmm_refinement.py,sha256=nQWimvse6dclcXhbU707rGbRVMKHM0mU_ZhH9g2yCMA,4641
26
+ smftools/hmm/train_hmm.py,sha256=srzRcB9LEmNuHyBM0R5Z0VEnxecifQt-MoaJhADxGT8,2477
27
+ smftools/informatics/__init__.py,sha256=8tvVG08L_Z-bP28PusBtVt1UTnHxuKi0lImLNcP7qso,338
28
+ smftools/informatics/basecall_pod5s.py,sha256=Ynmxscsxj6qp-zVY0RWodq513oDuHDaHnpqoepB3RUU,3930
29
+ smftools/informatics/fast5_to_pod5.py,sha256=h-cUZX5sWwPCkQ4g3kyz3koSBjZOWI6EjSpWO8zib1I,862
30
+ smftools/informatics/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
31
+ smftools/informatics/subsample_fasta_from_bed.py,sha256=YqYV09rvEQdeiS5hTTrKa8xYmJfeM3Vk-UUqwpw0qBk,1983
32
+ smftools/informatics/subsample_pod5.py,sha256=zDw9tRcrFRmPI62xkcy9dh8IfsJcuYm7R-FVeBC_g3s,4701
33
+ smftools/informatics/archived/bam_conversion.py,sha256=I8EzXjQixMmqx2oWnoNSH5NURBhfT-krbWHkoi_M964,3330
34
+ smftools/informatics/archived/bam_direct.py,sha256=jbEFtUIiUR8Wlp3po_sWkr19AUNS9WZjglojb9j28vo,3606
35
+ smftools/informatics/archived/basecalls_to_adata.py,sha256=-Nag6lr_NAtU4t8jo0GSMdgIAIfmDge-5VEUPQbEatE,3692
36
+ smftools/informatics/archived/conversion_smf.py,sha256=QhlISVi3Z-XqFKyDG_CenLojovAt5-ZhuVe9hus36lg,7177
37
+ smftools/informatics/archived/deaminase_smf.py,sha256=mNeg1mIYYVLIiW8powEpz0CqrGRDsrmY5-aoIgwMGHs,7221
38
+ smftools/informatics/archived/direct_smf.py,sha256=ylPGFBvRLdxLHeDJjAwq98j8Q8_lfGK3k5JJnQxrwJw,7485
39
+ smftools/informatics/archived/print_bam_query_seq.py,sha256=8Z2ZJEOOlfWYUXiZGjteLWU4yTgvV8KQzEIBHUmamGM,838
40
+ smftools/informatics/helpers/__init__.py,sha256=EgCIcJ6o3_R3vzsFwhtvOcKKWnmmMmN_GZXDQ_K_-NI,2693
41
+ smftools/informatics/helpers/align_and_sort_BAM.py,sha256=gy_BU6KfDd584LPFybJ7JzNwfCD95dZXx6MccnT4Qro,3725
42
+ smftools/informatics/helpers/aligned_BAM_to_bed.py,sha256=e6yg5-yHcw0QPFI3oRVHrhfAUj7US77Ir2VVzE3c-x8,3374
43
+ smftools/informatics/helpers/bam_qc.py,sha256=IlrXXpCdTYIv_89SE8D5tJ1wtTzxWGjk9vc-rbC1UjU,2430
44
+ smftools/informatics/helpers/bed_to_bigwig.py,sha256=AazYEZzKgKgukSFwCpeiApzxh1kbt11X4RFqRIiBIaY,1466
45
+ smftools/informatics/helpers/binarize_converted_base_identities.py,sha256=yOepGaNBGfZJEsMiLRwKauvsmaHn_JRrxaGp8LmKAXs,7778
46
+ smftools/informatics/helpers/canoncall.py,sha256=5WS6lwukc_xYTdPQy0OSj-WLbx0Rg70Cun1lCucY7w8,1741
47
+ smftools/informatics/helpers/complement_base_list.py,sha256=k6EkLtxFoajaIufxw1p0pShJ2nPHyGLTbzZmIFFjB4o,532
48
+ smftools/informatics/helpers/concatenate_fastqs_to_bam.py,sha256=0jy4H1ORuqaarsznv9tS1SM8CCRjaaD20NMknNvQPv0,16212
49
+ smftools/informatics/helpers/converted_BAM_to_adata.py,sha256=sRmOtn0kNosLYfogqslDHg1Azk51l6nfNOLgQOnQjlA,14591
50
+ smftools/informatics/helpers/converted_BAM_to_adata_II.py,sha256=9Tz-qWtK9v1DTlK6yManvhOlMcaHFQUmmrRZQ5eiECw,22229
51
+ smftools/informatics/helpers/count_aligned_reads.py,sha256=uYyUYglF1asiaoxr-LKxPMUEbfyD7FS-dumTg2hJHzQ,2170
52
+ smftools/informatics/helpers/demux_and_index_BAM.py,sha256=2B_UiU05ln3gYvcN9aC_w6qs8j_WAF4pHWZekAYsXm4,2114
53
+ smftools/informatics/helpers/discover_input_files.py,sha256=hUes2iKBQW_sVmAYD-1JnLD9Ub-COEHzrRKWNFipl0g,3725
54
+ smftools/informatics/helpers/extract_base_identities.py,sha256=2yvr5uff9ah0jylFjNMt7oRJb1z_YdhvM6htSxI0frg,3038
55
+ smftools/informatics/helpers/extract_mods.py,sha256=MbSIiyj3zx7WlSSWMRPriLMkBtxYc1EWZiAAirMVgqA,3865
56
+ smftools/informatics/helpers/extract_read_features_from_bam.py,sha256=SYAb4Q1HxiJzCx5bIz86MdH_TvVPsRAVodZD9082HGY,1491
57
+ smftools/informatics/helpers/extract_read_lengths_from_bed.py,sha256=Cw39wgp1eBTV45Wk1l0c9l-upBW5N2OcgyWXTAXln90,678
58
+ smftools/informatics/helpers/extract_readnames_from_BAM.py,sha256=3FxSNqbZ1VsOK2RfHrvevQTzhWATf5E8bZ5yVOqayvk,759
59
+ smftools/informatics/helpers/find_conversion_sites.py,sha256=JPlDipmzeCBkV_T6esGD5ptwmbQmk8gJMTh7NMaSYd4,2480
60
+ smftools/informatics/helpers/generate_converted_FASTA.py,sha256=UniQfERNt4FC5L8T1tzr4cLQOJc3wMBPhuWmC-lC8Fs,3747
61
+ smftools/informatics/helpers/get_chromosome_lengths.py,sha256=sLumLrGsU_Xg_oJcdOpQyjUGpJoT2HbcmxWwbwzXUlE,1036
62
+ smftools/informatics/helpers/get_native_references.py,sha256=fRuyEm9UJkfd5DwHmFb1bxEtNvtSI1_BxGRmrCymGkw,981
63
+ smftools/informatics/helpers/index_fasta.py,sha256=N3IErfSiavYldeaat8xcQgA1MpykoQHcE0gHUeWuClE,267
64
+ smftools/informatics/helpers/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
65
+ smftools/informatics/helpers/make_modbed.py,sha256=cOQ97gPfRiCcw_fqboxousXIiOYjp78IFYLbu749U1Y,939
66
+ smftools/informatics/helpers/modQC.py,sha256=LeOBObG8gAVVdgESIMceYhd5AW1gfN7ABo91OQtOzTM,1041
67
+ smftools/informatics/helpers/modcall.py,sha256=LVPrdMNVp2gyQTJ4BNp8NJNm89AueDjsKaY7Gqkluho,1777
68
+ smftools/informatics/helpers/modkit_extract_to_adata.py,sha256=yjG_Onh6YgwpE11I8mgQyN6F-8yStJgvDcI38L13i4A,52098
69
+ smftools/informatics/helpers/ohe_batching.py,sha256=QVOiyl9fYHNIFWM23afYnQo0uaOjf1NR3ASKGVSrmuw,2975
70
+ smftools/informatics/helpers/ohe_layers_decode.py,sha256=gIgUC9L8TFLi-fTnjR4PRzXdUaH5D6WL2Hump6XOoy0,1042
71
+ smftools/informatics/helpers/one_hot_decode.py,sha256=3n4rzY8_aC9YKmgrftsguMsH7fUyQ-DbWmrOYF6la9s,906
72
+ smftools/informatics/helpers/one_hot_encode.py,sha256=5hHigA6-SZLK84WH_RHo06F_6aTg7S3TJgvSr8gxGX8,1968
73
+ smftools/informatics/helpers/plot_bed_histograms.py,sha256=sdtz_ieU_5rz8WyfAzjxbzY_w8kLdE_Rklvjax1hl3Q,10442
74
+ smftools/informatics/helpers/run_multiqc.py,sha256=qkw48DeBdTEqzhKFGjMUlvNmTehp8wRPkcxdkwERkHc,980
75
+ smftools/informatics/helpers/separate_bam_by_bc.py,sha256=WJZwKCYODUvzFaVWwX3SUE8sxEXmeYmSi7Dl9h2J2EY,1802
76
+ smftools/informatics/helpers/split_and_index_BAM.py,sha256=yowMusTGoC7uRD0jAwOHzBegX6MV7f-uY-XSzkX5cBw,1253
77
+ smftools/informatics/helpers/archived/informatics.py,sha256=gKb2ZJ_LcAeEXuQqn9e-QDF_sS4tMpMTr2vZlqa7n54,14572
78
+ smftools/informatics/helpers/archived/load_adata.py,sha256=DhvYYqO9VLsZqhL1WjN9sd-e3fgvdXGlgTP18z1h0L0,33654
79
+ smftools/machine_learning/__init__.py,sha256=cWyGN_QVcssqBr_VVr7xh2Inz0P7ylqUmBBcpMgsK0k,257
80
+ smftools/machine_learning/data/__init__.py,sha256=xbfLE-gNjdgxvZ9LKTdvjAtbIHOcs2TR0Gz3YRFbo38,113
81
+ smftools/machine_learning/data/anndata_data_module.py,sha256=ktrdMVMk5yhIUrnu-G_Xf3y7G-KP9PyhYZhobv8TCVg,10063
82
+ smftools/machine_learning/data/preprocessing.py,sha256=dSs6Qs3wmlccFPZSpOc-uy1nlFSf68wWQKwF1iTqMok,137
83
+ smftools/machine_learning/evaluation/__init__.py,sha256=KHvcC7bTYv-ThptAi6G8wD-hW5Iz1HPgMcQ3AewtK3c,122
84
+ smftools/machine_learning/evaluation/eval_utils.py,sha256=t9WIevIJ6b6HqU6OYaNx7UBAa5TEIPFmZow6n_ZDZeY,1105
85
+ smftools/machine_learning/evaluation/evaluators.py,sha256=KqYHqbVV2WOs0Yo4GIhLS_0h1oKY6nd1yi6piDWYQLg,8184
86
+ smftools/machine_learning/inference/__init__.py,sha256=vWLQD-JNEKKNGuzDtx7vcE4czKKXEO6S-0Zp5-21fPs,172
87
+ smftools/machine_learning/inference/inference_utils.py,sha256=aJuXvTgC8v4BOjLCgOU9vT3S2y1UGoZjq4mQpPswTQU,947
88
+ smftools/machine_learning/inference/lightning_inference.py,sha256=34WVnPfpPDf4KM8ZN5MOsx4tYgsrUclkens6GXgB4Ek,2160
89
+ smftools/machine_learning/inference/sklearn_inference.py,sha256=FomgQF5jFBfAj1-H2Q0_RPmvR9rDJsmUeaWOVRhbpTw,1612
90
+ smftools/machine_learning/inference/sliding_window_inference.py,sha256=8zjQs2hGhj0Dww4gWljLVK0g002_U96dyIqQJiDdSDY,4426
91
+ smftools/machine_learning/models/__init__.py,sha256=bMfPbQ5bDmn_kWv82virLuUhjb12Yow7t_j96afNbyA,421
92
+ smftools/machine_learning/models/base.py,sha256=p3d77iyY8BVx0tYL0TjmOSnPNP1ZrKTzn_J05e2GF0A,9626
93
+ smftools/machine_learning/models/cnn.py,sha256=KKZmJLQ6Bjm_HI8GULnafjz6mRy5BZ6Y0ZCgDSuS268,4465
94
+ smftools/machine_learning/models/lightning_base.py,sha256=3nC3wajPIupFMtOq3YUf24_SHvDoW_9BIGyIvEwzN9w,13626
95
+ smftools/machine_learning/models/mlp.py,sha256=Y2hc_qHj6vpM_mHpreFxBULn4MkR25oEA1LXu5sPA_w,820
96
+ smftools/machine_learning/models/positional.py,sha256=EfTyYnY0pCB-aVJIWf-4DVNpyGlvx1q_09PzfrC-VlA,652
97
+ smftools/machine_learning/models/rnn.py,sha256=uJnHDGpT2_l_HqHGsx33XGF3v3EYZPeOtSQ89uvhdpE,717
98
+ smftools/machine_learning/models/sklearn_models.py,sha256=ssV-mR3rmcjycQEzKccRcbVaEjZp0zRNUL5-R6m1UKU,10402
99
+ smftools/machine_learning/models/transformer.py,sha256=8YXS0vCcOWT-33h-8yeDfFM5ibPHQ-CMSEhGWzR4pm8,11039
100
+ smftools/machine_learning/models/wrappers.py,sha256=HEY2A6-Bk6MtVZ9jOaPT8S1Qi0L98SyEg1nbKqYZoag,697
101
+ smftools/machine_learning/training/__init__.py,sha256=teUmwpnmAl0oNFaqVrfoijEpxBjLwI5YtBwLHT3uXck,185
102
+ smftools/machine_learning/training/train_lightning_model.py,sha256=usEBaQ4vNjfatefP5XDCXkywzgZ2D-YppGmT3-3gTGE,4070
103
+ smftools/machine_learning/training/train_sklearn_model.py,sha256=m1k1Gsynpj6SJI64rl4B3cfXm1SliU0fwMAj1-bAAeE,3166
104
+ smftools/machine_learning/utils/__init__.py,sha256=yOpzBc9AXbarSRfN8Ixh2Z1uWLGpgpjRR46h6E46_2w,62
105
+ smftools/machine_learning/utils/device.py,sha256=GITrULOty2Fr96Bqt1wi1PaYl_oVgB5Z99Gfn5vQy4o,274
106
+ smftools/machine_learning/utils/grl.py,sha256=BWBDp_kQBigrUzQpRbZzgpfr_WOcd2K2V3MQL-aAIc4,334
107
+ smftools/plotting/__init__.py,sha256=7T3-hZFgTY0nfQgV4J6Vn9ogwkNMlY315kguZR7V1AI,866
108
+ smftools/plotting/autocorrelation_plotting.py,sha256=wdqQ4dawibgZeXRs_G4WterkqOgxHWrJlgZ4PdtO-OA,27456
109
+ smftools/plotting/classifiers.py,sha256=8_zabh4NNB1_yVxLD22lfrfl5yfzbEoG3XWqlIqdtrQ,13786
110
+ smftools/plotting/general_plotting.py,sha256=JOeF_lq2hCDt1Vgy8KYLKSzNj8SXGi3z6qMt2P68TDU,31458
111
+ smftools/plotting/hmm_plotting.py,sha256=3Eq82gty_0b8GkSMCQgUlbKfzR9h2fJ5rZkB8yYGX-M,10934
112
+ smftools/plotting/position_stats.py,sha256=4XukYIWeWZ_aGSZg1K0t37KA2aknjNNKT5kcKFfuz8Q,17428
113
+ smftools/plotting/qc_plotting.py,sha256=q5Ri0q89udvNUFUNxHzgk9atvQYqUkqkS5-JFq9EqoI,10045
114
+ smftools/preprocessing/__init__.py,sha256=VqhiwJg57m0ePCRAGfX3cJniNLV2jNJpoXZEM2j-0wU,1687
115
+ smftools/preprocessing/add_read_length_and_mapping_qc.py,sha256=zD_Kxw3DvyOypfuSMGv0ESyt-02w4XlAAMqQxb7yDNQ,5700
116
+ smftools/preprocessing/append_base_context.py,sha256=ohtdHNS1Y9ttLvhLKSwrOyar7HyU2Dw0Ach9WVx5QM8,6221
117
+ smftools/preprocessing/append_binary_layer_by_base_context.py,sha256=I3iiZkVqqB1KqSiA-s-ctl-ESkuTpd7Ot82M0xv_Cm4,6202
118
+ smftools/preprocessing/binarize_on_Youden.py,sha256=O5E3vFc2zXMfKW0p0JGDlmRKEx2_VP6dAqfvrumzz00,1797
119
+ smftools/preprocessing/binary_layers_to_ohe.py,sha256=Lxd8knelNTaUozfGMFNMlnrOb6uP28Laj3Ymw6cRHL0,1826
120
+ smftools/preprocessing/calculate_complexity.py,sha256=cXMpFrhkwkPipQo2GZGT5yFknMYUMt1t8gz0Cse1DrA,3288
121
+ smftools/preprocessing/calculate_complexity_II.py,sha256=DGfl0jkuBPUpzhKVItN0W7EPzh-QYuR4IxRObPE6gAQ,9301
122
+ smftools/preprocessing/calculate_consensus.py,sha256=6zRpRmb2xdfDu5hctZrReALRb7Pjn8sy8xJZTm3o0nU,2442
123
+ smftools/preprocessing/calculate_coverage.py,sha256=4WTILzKLzxGLSsQrZkshXP-IRQpoVu3Fkqc0QTpux3Y,2132
124
+ smftools/preprocessing/calculate_pairwise_differences.py,sha256=5zJbNNaFld5qgKRoPyplCmMHflbvAQ9eKWCXPXPpJ60,1774
125
+ smftools/preprocessing/calculate_pairwise_hamming_distances.py,sha256=e5Mzyex7pT29H2PY014uU4Fi_eewbut1JkzC1ffBbCg,961
126
+ smftools/preprocessing/calculate_position_Youden.py,sha256=9GY_WWwaxpB2Xppck3WT1zHtFOhTXrpuDIgbxLC9A7E,7450
127
+ smftools/preprocessing/calculate_read_length_stats.py,sha256=gNNePwMqYZJidzGgT1ZkfSlvc5Y3I3bi5KNYpP6wQQc,4584
128
+ smftools/preprocessing/calculate_read_modification_stats.py,sha256=fQYtwsGt6zq7QBlWtAEaFOkbV_4yXjrj9GnBryEEztc,4779
129
+ smftools/preprocessing/clean_NaN.py,sha256=IOcnN5YF05gpPQc3cc3IS83petCnhCpkYiyT6bXEyx0,1937
130
+ smftools/preprocessing/filter_adata_by_nan_proportion.py,sha256=GZcvr2JCsthX8EMw34S9-W3fc6JElw6ka99Jy6f2JvA,1292
131
+ smftools/preprocessing/filter_reads_on_length_quality_mapping.py,sha256=93LgTy_vsPnOZgoiXhZ1-w_pix2oFdBk-dsBUoz33Go,7379
132
+ smftools/preprocessing/filter_reads_on_modification_thresholds.py,sha256=wOmHhQj3xQALQdtQ4-v4POEOat5bEJa-BVmzEE_yrKA,19403
133
+ smftools/preprocessing/flag_duplicate_reads.py,sha256=D7KrDuyy_TSgGvB5aRRmY01k36p92n48YEwmwsUd3IY,65595
134
+ smftools/preprocessing/invert_adata.py,sha256=HYMJ1sR3Ui8j6bDjY8OcVQOETzZV-_rrpIYaWLZL6S4,1049
135
+ smftools/preprocessing/load_sample_sheet.py,sha256=AjJf2MrqGHJJ2rNjYi09zV1QkLTq8qGaHGVklXHnPuU,1908
136
+ smftools/preprocessing/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
137
+ smftools/preprocessing/min_non_diagonal.py,sha256=hx1asW8CEmLaIroZISW8EcAf_RnBEC_nofGD8QG0b1E,711
138
+ smftools/preprocessing/recipes.py,sha256=cfKEpKW8TtQLe1CMdSHyPuIgKiWOPn7uP6uMIoRlnaQ,7063
139
+ smftools/preprocessing/subsample_adata.py,sha256=ivJvJIOvEtyvAjqZ7cwEeVedm4QgJxCJEI7sFaTuI3w,2360
140
+ smftools/preprocessing/archives/mark_duplicates.py,sha256=kwfstcWb7KkqeNB321dB-NLe8yd9_hZsSmpL8pCVBQg,8747
141
+ smftools/preprocessing/archives/preprocessing.py,sha256=4mLT09A7vwRZ78FHmuwtv38mH9TQ9qrZc_WjHRhhkIw,34379
142
+ smftools/preprocessing/archives/remove_duplicates.py,sha256=Erooi5_1VOUNfWpzddzmMNYMCl1U1jJryt7ZtMhabAs,699
143
+ smftools/tools/__init__.py,sha256=QV3asy5_lP9wcRzpNTfxGTCcpykkbNYvzxSMpFw4KXU,719
144
+ smftools/tools/calculate_umap.py,sha256=2arbAQdFOtnWoPq22TWicyr6fLYZ5PTNeZv_jdwuk_I,2491
145
+ smftools/tools/cluster_adata_on_methylation.py,sha256=UDC5lpW8fZ6O-16ETu-mbflLkNBKuIg7RIzQ9r7knvA,5760
146
+ smftools/tools/general_tools.py,sha256=YbobB6Zllz6cUq50yolGH9Jr6uuAMvEI4m3hiJ6FmAI,2561
147
+ smftools/tools/position_stats.py,sha256=Z7VW54wUVzH1RQ9xhP6KO7ewp-xeLybd07I5umV_aqM,24369
148
+ smftools/tools/read_stats.py,sha256=w3Zaim6l__Kt8EPCJKXTlMgO51Iy2Milj6yUb88HXiI,6324
149
+ smftools/tools/spatial_autocorrelation.py,sha256=uQkuPi2PJCj5lZzb33IWTL-e-p3J6PdMeM88rUFfQRw,21212
150
+ smftools/tools/subset_adata.py,sha256=nBbtAxCNteZCUBmPnZ9swQNyU74XgWM8aJHHWg2AuL0,1025
151
+ smftools/tools/archived/apply_hmm.py,sha256=pJXCULay0zbmubrwql368y7yiHAZr2bJhuGx2QUuKnE,9321
152
+ smftools/tools/archived/classifiers.py,sha256=mwSTpWUXBPjmUuV5i_SMG1lIPpHSMCzsKhl8wTbm-Og,36903
153
+ smftools/tools/archived/classify_methylated_features.py,sha256=Z0N2UKw3luD3CTQ8wcUvdnMY7w-8574OJbEcwzNsy88,2897
154
+ smftools/tools/archived/classify_non_methylated_features.py,sha256=IJERTozEs7IPL7K-VIjq2q2K36wRCW9iiNSYLAXasrA,3256
155
+ smftools/tools/archived/subset_adata_v1.py,sha256=qyU9iCal03edb5aUS3AZ2U4TlL3uQ42jGI9hX3QF7Fc,1047
156
+ smftools/tools/archived/subset_adata_v2.py,sha256=OKZoUpvdURPtckIQxGTWmOI5jLa-_EU62Xs3LyyehnA,1880
157
+ smftools-0.2.1.dist-info/METADATA,sha256=MXyiJbt1w_Ln4ENxQNbLU0JWwE-S6z1oNZkd8gkf3J8,8958
158
+ smftools-0.2.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
159
+ smftools-0.2.1.dist-info/entry_points.txt,sha256=NflK6zRv2zlvnjCnDSHycp9w9CczHLfGz9zAc4FtI0I,46
160
+ smftools-0.2.1.dist-info/licenses/LICENSE,sha256=F8LwmL6vMPddaCt1z1S83Kh_OZv50alTlY7BvVx1RXw,1066
161
+ smftools-0.2.1.dist-info/RECORD,,
@@ -0,0 +1,2 @@
+ [console_scripts]
+ smftools = smftools.cli:cli
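The new `entry_points.txt` wires a `smftools` console command to `smftools.cli:cli`, and `click` appears as a new dependency in the METADATA diff above. A minimal sketch of what a click group behind an entry point of this shape typically looks like; the `load-adata` subcommand and its option are illustrative assumptions, not the package's documented interface:

# Hypothetical click-based CLI shaped like the `smftools = smftools.cli:cli` entry point.
# The subcommand name and option below are assumptions for illustration only.
import click

@click.group()
def cli():
    """smftools command-line interface."""

@cli.command("load-adata")
@click.option("--config", "config_path", required=True, type=click.Path(exists=True))
def load_adata_command(config_path):
    """Run the config-driven loading pipeline (wiring assumed, not taken from this diff)."""
    from smftools import load_adata  # smftools/load_adata.py is added in 0.2.1; exact API assumed
    load_adata.load_adata(config_path)

if __name__ == "__main__":
    cli()

With the entry point installed, the same group would be reachable from the shell as a `smftools` command.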
@@ -1,75 +0,0 @@
- ## LoadExperimentConfig
-
- class LoadExperimentConfig:
-     """
-     Loads the experiment configuration CSV and stores the parsed parameters in a dictionary attribute.
-     Parameters:
-         experiment_config (str): File path to the experiment configuration CSV file.
-
-     Attributes:
-         var_dict (dict): A dictionary containing experiment configuration parameters.
-
-     Example:
-         >>> import pandas as pd
-         >>> from io import StringIO
-         >>> csv_data = '''variable,value,type
-         ... mapping_threshold,0.05,float
-         ... batch_size,4,int
-         ... testing_bool,True,bool
-         ... strands,"[bottom, top]",list
-         ... split_dir,split_bams,string
-         ... pod5_dir,None,string
-         ... pod5_dir,,string
-         ... '''
-         >>> csv_file = StringIO(csv_data)
-         >>> df = pd.read_csv(csv_file)
-         >>> df.to_csv('test_config.csv', index=False)
-         >>> config_loader = LoadExperimentConfig('test_config.csv')
-         >>> config_loader.var_dict['mapping_threshold']
-         0.05
-         >>> config_loader.var_dict['batch_size']
-         4
-         >>> config_loader.var_dict['testing_bool']
-         True
-         >>> config_loader.var_dict['strands']
-         ['bottom', 'top']
-         >>> config_loader.var_dict['split_dir']
-         'split_bams'
-         >>> config_loader.var_dict['pod5_dir'] is None
-         True
-         >>> config_loader.var_dict['pod5_dir'] is None
-         True
-     """
-     def __init__(self, experiment_config):
-         import pandas as pd
-         print(f"Loading experiment config from {experiment_config}")
-         # Read the CSV into a pandas DataFrame
-         df = pd.read_csv(experiment_config)
-         # Initialize an empty dictionary to store variables
-         var_dict = {}
-         # Iterate through each row in the DataFrame
-         for _, row in df.iterrows():
-             var_name = str(row['variable'])
-             value = row['value']
-             dtype = row['type']
-             # Handle empty and None values
-             if pd.isna(value) or value in ['None', '']:
-                 value = None
-             else:
-                 # Handle different data types
-                 if dtype == 'list':
-                     # Convert the string representation of a list to an actual list
-                     value = value.strip('()[]').replace(', ', ',').split(',')
-                 elif dtype == 'int':
-                     value = int(value)
-                 elif dtype == 'float':
-                     value = float(value)
-                 elif dtype == 'bool':
-                     value = value.lower() == 'true'
-                 elif dtype == 'string':
-                     value = str(value)
-             # Store the variable in the dictionary
-             var_dict[var_name] = value
-         # Save the dictionary as an attribute of the class
-         self.var_dict = var_dict
-
@@ -1,53 +0,0 @@
- # plot_read_length_and_coverage_histograms
-
- def plot_read_length_and_coverage_histograms(bed_file, plotting_directory):
-     """
-     Plots read length and coverage statistics for each record.
-
-     Parameters:
-         bed_file (str): Path to the bed file to derive read lengths and coverage from.
-         plotting_directory (str): Path to the directory to write out histograms.
-
-     Returns:
-         None
-     """
-     import pandas as pd
-     import matplotlib.pyplot as plt
-     import numpy as np
-     import os
-
-     bed_basename = os.path.basename(bed_file).split('.bed')[0]
-     # Load the BED file into a DataFrame
-     print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
-     df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name'])
-
-     # Group by chromosome
-     grouped = df.groupby('chromosome')
-
-     for chrom, group in grouped:
-         # Plot read length histogram
-         plt.figure(figsize=(12, 6))
-         plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
-         plt.title(f'Read Length Histogram of reads aligned to {chrom}')
-         plt.xlabel('Read Length')
-         plt.ylabel('Count')
-         plt.grid(True)
-         save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
-         plt.savefig(save_name)
-         plt.close()
-
-         # Compute per-position coverage
-         coverage = np.zeros(group['end'].max())
-         for _, row in group.iterrows():
-             coverage[row['start']:row['end']] += 1
-
-         # Plot coverage profile along the record
-         plt.figure(figsize=(12, 6))
-         plt.plot(coverage, color='b')
-         plt.title(f'Coverage Histogram for {chrom}')
-         plt.xlabel('Position')
-         plt.ylabel('Coverage')
-         plt.grid(True)
-         save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
-         plt.savefig(save_name)
-         plt.close()
1
- ## load_adata
2
-
3
- def load_adata(config_path):
4
- """
5
- High-level function to call for converting raw sequencing data to an adata object.
6
- Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
7
- Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
8
- Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
9
-
10
- Parameters:
11
- config_path (str): A string representing the file path to the experiment configuration csv file.
12
-
13
- Returns:
14
- None
15
- """
16
- # Lazy importing of packages
17
- from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam, extract_read_features_from_bam
18
- from .fast5_to_pod5 import fast5_to_pod5
19
- from .subsample_fasta_from_bed import subsample_fasta_from_bed
20
- import os
21
- import numpy as np
22
- import anndata as ad
23
- from pathlib import Path
24
-
25
- # Default params
26
- bam_suffix = '.bam' # If different, change from here.
27
- split_dir = 'demultiplexed_BAMs' # If different, change from here.
28
- strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
29
- conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
30
-
31
- # Load experiment config parameters into global variables
32
- experiment_config = LoadExperimentConfig(config_path)
33
- var_dict = experiment_config.var_dict
34
-
35
- # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
36
- default_value = None
37
-
38
- # General config variable init
39
- smf_modality = var_dict.get('smf_modality', default_value) # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Necessary.
40
- input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
41
- output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
42
- fasta = var_dict.get('fasta', default_value) # Path to reference FASTA.
43
- fasta_regions_of_interest = var_dict.get("fasta_regions_of_interest", default_value) # Path to a bed file listing coordinate regions of interest within the FASTA to include. Optional.
44
- mapping_threshold = var_dict.get('mapping_threshold', default_value) # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
45
- experiment_name = var_dict.get('experiment_name', default_value) # A key term to add to the AnnData file name.
46
- model_dir = var_dict.get('model_dir', default_value) # needed for dorado basecaller
47
- model = var_dict.get('model', default_value) # needed for dorado basecaller
48
- barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
49
- barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
50
- trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
51
- input_already_demuxed = var_dict.get('input_already_demuxed', default_value) # If the input files are already demultiplexed.
52
- threads = var_dict.get('threads', default_value) # number of cpu threads available for multiprocessing
53
- # Conversion specific variable init
54
- conversion_types = var_dict.get('conversion_types', default_value)
55
- # Direct methylation specific variable init
56
- filter_threshold = var_dict.get('filter_threshold', default_value)
57
- m6A_threshold = var_dict.get('m6A_threshold', default_value)
58
- m5C_threshold = var_dict.get('m5C_threshold', default_value)
59
- hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
60
- thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
61
- mod_list = var_dict.get('mod_list', default_value)
62
- batch_size = var_dict.get('batch_size', default_value)
63
- device = var_dict.get('device', 'auto')
64
- make_bigwigs = var_dict.get('make_bigwigs', default_value)
65
- skip_unclassified = var_dict.get('skip_unclassified', True)
66
- delete_batch_hdfs = var_dict.get('delete_batch_hdfs', True)
67
-
68
- # Make initial output directory
69
- make_dirs([output_directory])
70
- os.chdir(output_directory)
71
- # Define the pathname to split BAMs into later during demultiplexing.
72
- split_path = os.path.join(output_directory, split_dir)
73
-
74
- # If fasta_regions_of_interest is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
75
- if fasta_regions_of_interest and '.bed' in fasta_regions_of_interest:
76
- fasta_basename = os.path.basename(fasta).split('.fa')[0]
77
- bed_basename_minus_suffix = os.path.basename(fasta_regions_of_interest).split('.bed')[0]
78
- output_FASTA = fasta_basename + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta'
79
- subsample_fasta_from_bed(fasta, fasta_regions_of_interest, output_directory, output_FASTA)
80
- fasta = os.path.join(output_directory, output_FASTA)
81
-
82
- # If conversion_types is passed:
83
- if conversion_types:
84
- conversions += conversion_types
85
-
86
- # Get the input filetype
87
- if Path(input_data_path).is_file():
88
- input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
89
- input_is_pod5 = input_data_filetype in ['.pod5','.p5']
90
- input_is_fast5 = input_data_filetype in ['.fast5','.f5']
91
- input_is_fastq = input_data_filetype in ['.fastq', '.fq']
92
- input_is_bam = input_data_filetype == bam_suffix
93
- if input_is_fastq:
94
- fastq_paths = [input_data_path]
95
- elif Path(input_data_path).is_dir():
96
- # Get the file names in the input data dir
97
- input_files = os.listdir(input_data_path)
98
- input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
99
- input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
100
- input_is_fastq = sum([True for file in input_files if '.fastq' in file or '.fq' in file])
101
- input_is_bam = sum([True for file in input_files if bam_suffix in file])
102
- if input_is_fastq:
103
- fastq_paths = [os.path.join(input_data_path, file) for file in input_files if '.fastq' in file or '.fq' in file]
104
-
105
- # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
106
- if input_is_fast5 and not input_is_pod5:
107
- # take the input directory of fast5 files and write out a single pod5 file into the output directory.
108
- output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
109
- print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
110
- fast5_to_pod5(input_data_path, output_pod5)
111
- # Reassign the pod5_dir variable to point to the new pod5 file.
112
- input_data_path = output_pod5
113
- input_is_pod5 = True
114
- input_is_fast5 = False
115
-
116
- elif input_is_fastq:
117
- output_bam = os.path.join(output_directory, 'FASTQs_concatenated_into_BAM.bam')
118
- concatenate_fastqs_to_bam(fastq_paths, output_bam, barcode_tag='BC', gzip_suffix='.gz')
119
- input_data_path = output_bam
120
- input_is_bam = True
121
- input_is_fastq = False
122
-
123
- if input_is_pod5:
124
- basecall = True
125
- elif input_is_bam:
126
- basecall = False
127
- else:
128
- print('Error, can not find input bam or pod5')
129
-
130
- if smf_modality == 'conversion':
131
- from .conversion_smf import conversion_smf
132
- final_adata, final_adata_path, sorted_output, bam_files = conversion_smf(fasta, output_directory, conversions, strands, model_dir, model, input_data_path, split_path
133
- , barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed)
134
- elif smf_modality == 'direct':
135
- from .direct_smf import direct_smf
136
- # need to add input_already_demuxed workflow here.
137
- final_adata, final_adata_path, sorted_output, bam_files = direct_smf(fasta, output_directory, mod_list,model_dir, model, thresholds, input_data_path, split_path
138
- , barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads)
139
- else:
140
- print("Error")
141
-
142
- # Read in the final adata object and append final metadata
143
- #print(f'Reading in adata from {final_adata_path} to add final metadata')
144
- # final_adata = ad.read_h5ad(final_adata_path)
145
-
146
- # Adding read query length metadata to adata object.
147
- read_metrics = {}
148
- for bam_file in bam_files:
149
- bam_read_metrics = extract_read_features_from_bam(bam_file)
150
- read_metrics.update(bam_read_metrics)
151
- #read_metrics = extract_read_features_from_bam(sorted_output)
152
-
153
- query_read_length_values = []
154
- query_read_quality_values = []
155
- reference_lengths = []
156
- # Iterate over each row of the AnnData object
157
- for obs_name in final_adata.obs_names:
158
- # Fetch the value from the dictionary using the obs_name as the key
159
- value = read_metrics.get(obs_name, np.nan) # Use np.nan if the key is not found
160
- if type(value) is list:
161
- query_read_length_values.append(value[0])
162
- query_read_quality_values.append(value[1])
163
- reference_lengths.append(value[2])
164
- else:
165
- query_read_length_values.append(value)
166
- query_read_quality_values.append(value)
167
- reference_lengths.append(value)
168
-
169
- # Add the new column to adata.obs
170
- final_adata.obs['query_read_length'] = query_read_length_values
171
- final_adata.obs['query_read_quality'] = query_read_quality_values
172
- final_adata.obs['query_length_to_reference_length_ratio'] = np.array(query_read_length_values) / np.array(reference_lengths)
173
-
174
- final_adata.obs['Raw_methylation_signal'] = np.nansum(final_adata.X, axis=1)
175
- final_adata.obs['Raw_per_base_methylation_average'] = final_adata.obs['Raw_methylation_signal'] / final_adata.obs['query_read_length']
176
-
177
- print('Saving final adata')
178
- if ".gz" in final_adata_path:
179
- final_adata.write_h5ad(f"{final_adata_path}", compression='gzip')
180
- else:
181
- final_adata.write_h5ad(f"{final_adata_path}.gz", compression='gzip')
182
- print('Final adata saved')
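The removed `load_adata` entry point was driven entirely by the experiment-config CSV parsed by `LoadExperimentConfig`. A minimal sketch of how it was invoked in 0.1.7, with config keys taken from the `var_dict.get(...)` lookups above; the values are placeholders, not a validated configuration:

# Hypothetical invocation of the removed CSV-driven pipeline; the config keys mirror
# the var_dict.get(...) lookups above, but the values shown are placeholders.
import csv

rows = [
    ("smf_modality", "conversion", "string"),
    ("input_data_path", "/data/run1/pod5", "string"),
    ("output_directory", "/analysis/run1", "string"),
    ("fasta", "/refs/target_loci.fasta", "string"),
    ("mapping_threshold", "0.05", "float"),
    ("threads", "8", "int"),
]
with open("experiment_config.csv", "w", newline="") as handle:
    writer = csv.writer(handle)
    writer.writerow(["variable", "value", "type"])
    writer.writerows(rows)

# Module path as it existed in 0.1.7 (this module is removed in 0.2.1 per the diff).
from smftools.informatics.load_adata import load_adata
load_adata("experiment_config.csv")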
@@ -1,82 +0,0 @@
- ## append_C_context
-
- ## Conversion SMF Specific
- # Read methylation QC
- def append_C_context(adata, obs_column='Reference', use_consensus=False, native=False):
-     """
-     Adds cytosine context to each position within the given category. When use_consensus is True, the consensus sequence is used; otherwise the reference FASTA sequence is used.
-
-     Parameters:
-         adata (AnnData): The input adata object.
-         obs_column (str): The observation column to stratify on. Default is 'Reference', which should not be changed for most purposes.
-         use_consensus (bool): Whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
-         native (bool): If False, apply conversion SMF assumptions. If True, apply native SMF assumptions.
-
-     Returns:
-         None
-     """
-     import numpy as np
-     import anndata as ad
-
-     print('Adding Cytosine context based on reference FASTA sequence for sample')
-
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C', 'any_C_site']
-     categories = adata.obs[obs_column].cat.categories
-     for cat in categories:
-         # Assess if the strand is the top or bottom strand converted
-         if 'top' in cat:
-             strand = 'top'
-         elif 'bottom' in cat:
-             strand = 'bottom'
-
-         if native:
-             basename = cat.split(f"_{strand}")[0]
-             if use_consensus:
-                 sequence = adata.uns[f'{basename}_consensus_sequence']
-             else:
-                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                 sequence = adata.uns[f'{basename}_FASTA_sequence']
-         else:
-             basename = cat.split(f"_{strand}")[0]
-             if use_consensus:
-                 sequence = adata.uns[f'{basename}_consensus_sequence']
-             else:
-                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                 sequence = adata.uns[f'{basename}_FASTA_sequence']
-         # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
-         boolean_dict = {}
-         for site_type in site_types:
-             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-
-         if strand == 'top':
-             # Iterate through the sequence and apply the criteria
-             for i in range(1, len(sequence) - 1):
-                 if sequence[i] == 'C':
-                     boolean_dict[f'{cat}_any_C_site'][i] = True
-                     if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                         boolean_dict[f'{cat}_GpC_site'][i] = True
-                     elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                     elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                         boolean_dict[f'{cat}_CpG_site'][i] = True
-                     elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                         boolean_dict[f'{cat}_other_C'][i] = True
-         elif strand == 'bottom':
-             # Iterate through the sequence and apply the criteria
-             for i in range(1, len(sequence) - 1):
-                 if sequence[i] == 'G':
-                     boolean_dict[f'{cat}_any_C_site'][i] = True
-                     if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
-                         boolean_dict[f'{cat}_GpC_site'][i] = True
-                     elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
-                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                     elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
-                         boolean_dict[f'{cat}_CpG_site'][i] = True
-                     elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
-                         boolean_dict[f'{cat}_other_C'][i] = True
-         else:
-             print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
-
-         for site_type in site_types:
-             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
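The top-strand branch above classifies each C by its immediate neighbors; the following toy snippet restates that rule outside AnnData to make the GpC/CpG/ambiguous split explicit (illustrative only, not part of the package):

def classify_top_strand_c(sequence: str) -> dict:
    """Label each C on the top strand by context, mirroring the rules in the
    removed append_C_context function above (edge positions are skipped)."""
    labels = {}
    for i in range(1, len(sequence) - 1):
        if sequence[i] != 'C':
            continue
        before_g = sequence[i - 1] == 'G'
        after_g = sequence[i + 1] == 'G'
        if before_g and not after_g:
            labels[i] = 'GpC_site'
        elif before_g and after_g:
            labels[i] = 'ambiguous_GpC_CpG_site'
        elif not before_g and after_g:
            labels[i] = 'CpG_site'
        else:
            labels[i] = 'other_C'
    return labels

# The C at index 2 is flanked by G on both sides (ambiguous), the C at index 5
# is followed by G (CpG), and the C at index 8 has neither neighbor (other_C).
print(classify_top_strand_c("AGCGACGACA"))
# {2: 'ambiguous_GpC_CpG_site', 5: 'CpG_site', 8: 'other_C'}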