smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +9 -4
- smftools/_version.py +1 -1
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +0 -2
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/fast5_to_pod5.py +4 -1
- smftools/informatics/helpers/__init__.py +3 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
- smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
- smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +29 -3
- smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
- smftools/informatics/helpers/find_conversion_sites.py +5 -4
- smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
- smftools/informatics/helpers/split_and_index_BAM.py +1 -5
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/general_plotting.py +566 -89
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +13 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +849 -43
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/evaluation/__init__.py +0 -0
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
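The per-file line counts above summarize a member-by-member comparison of the two published wheels. A minimal sketch of how a similar comparison can be reproduced locally with only the standard library is shown below; it assumes both wheels have already been downloaded (for example with `pip download smftools==0.1.7 --no-deps` and `pip download smftools==0.2.1 --no-deps`), and unlike the summary above it treats renamed files as a removal plus an addition.

import difflib
import zipfile

# Assumed local file names for the two published wheels; adjust paths as needed.
OLD_WHEEL = "smftools-0.1.7-py3-none-any.whl"
NEW_WHEEL = "smftools-0.2.1-py3-none-any.whl"

def wheel_members(path):
    # A wheel is a zip archive; map each member name to its decoded lines.
    with zipfile.ZipFile(path) as zf:
        return {name: zf.read(name).decode("utf-8", errors="replace").splitlines()
                for name in zf.namelist()}

old, new = wheel_members(OLD_WHEEL), wheel_members(NEW_WHEEL)

for name in sorted(set(old) | set(new)):
    diff = list(difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm=""))
    added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
    removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
    if added or removed:
        print(f"{name} +{added} -{removed}")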
{smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: smftools
-Version: 0.1
+Version: 0.2.1
 Summary: Single Molecule Footprinting Analysis in Python.
 Project-URL: Source, https://github.com/jkmckenna/smftools
 Project-URL: Documentation, https://smftools.readthedocs.io/
@@ -46,6 +46,8 @@ Classifier: Topic :: Scientific/Engineering :: Visualization
 Requires-Python: >=3.9
 Requires-Dist: anndata>=0.10.0
 Requires-Dist: biopython>=1.79
+Requires-Dist: captum
+Requires-Dist: click
 Requires-Dist: fastcluster
 Requires-Dist: hydra-core
 Requires-Dist: igraph
@@ -64,8 +66,10 @@ Requires-Dist: scanpy>=1.9
 Requires-Dist: scikit-learn>=1.0.2
 Requires-Dist: scipy>=1.7.3
 Requires-Dist: seaborn>=0.11
+Requires-Dist: shap
 Requires-Dist: torch>=1.9.0
 Requires-Dist: tqdm
+Requires-Dist: upsetplot
 Requires-Dist: wandb
 Provides-Extra: docs
 Requires-Dist: ipython>=7.20; extra == 'docs'
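Relative to 0.1.7, the 0.2.1 METADATA adds four runtime requirements: click (presumably backing the new smftools/cli.py and entry_points.txt), captum and shap (model-attribution libraries, in line with the new machine_learning subpackage), and upsetplot. A quick way to confirm that an installed 0.2.1 environment declares them, using only the standard library:

from importlib.metadata import requires, version

print(version("smftools"))            # expected: 0.2.1 after upgrading
for requirement in requires("smftools"):
    print(requirement)                # now includes captum, click, shap and upsetplot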
smftools-0.2.1.dist-info/RECORD
@@ -0,0 +1,161 @@
+smftools/__init__.py,sha256=OXW2_b5NUGZhTXsH8qY0PzfJnaz8T2y6lCqMnSVSuIk,676
+smftools/_settings.py,sha256=Ed8lzKUA5ncq5ZRfSp0t6_rphEEjMxts6guttwTZP5Y,409
+smftools/_version.py,sha256=tC9CwL4Nm8brVXJnZNGk_eoZaJj6eOtLKtOrdJMrpoI,21
+smftools/cli.py,sha256=MNObu279y322JHkmugssM0rVHo0UQ1zboTG9MlqnMgQ,7033
+smftools/load_adata.py,sha256=VJMUBqRC8InIj48JMnkZKLuqEz1u8uSTNx_ARl0cn7M,74313
+smftools/readwrite.py,sha256=ObNxBj6Y_zIHqQpAvmHAddAypLjg7F3qARF-sH-V3do,42706
+smftools/config/__init__.py,sha256=ObUnnR7aRSoD_uvpmsxA_BUFt4NOOfWNopDVCqjp7tg,69
+smftools/config/conversion.yaml,sha256=rJGhrVd95p6_6OVxLq2lvobJu8SGzNYI80jU0fLeK_g,795
+smftools/config/deaminase.yaml,sha256=Vh3Wg0bCb88S20Ob-8zi3eQJ1g_pcBulR9pPbAX9U1o,1138
+smftools/config/default.yaml,sha256=0DYIvvdbzoB2eJgsoxEzx4Rc0TVGaiHa85nxo1VwCqQ,9704
+smftools/config/direct.yaml,sha256=2F_fGploWW3f88Y7sTZ68Vk9fgNaO-sb5AK-Cutc2TQ,735
+smftools/config/experiment_config.py,sha256=zQhWaag9hPuexnTOqZ-Od--c3iHs18c4Wc2sU-LOyts,52872
+smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz,sha256=q6wJtgFRDln0o20XNCx1qad3lwcdCoylqPN7wskTfI8,2926497
+smftools/datasets/F1_sample_sheet.csv,sha256=9PodIIOXK2eamYPbC6DGnXdzgi9bRDovf296j1aM0ak,259
+smftools/datasets/__init__.py,sha256=xkSTlPuakVYVCuRurif9BceNBDt6bsngJvvjI8757QI,142
+smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz,sha256=niOcVHaYY7h3XyvwSkN-V_NMBaRt2vTP5TrJO0CwMCs,8385050
+smftools/datasets/datasets.py,sha256=0y597Ntp707bOgDwN6O-JEt9yxgplj66p0aj6Zs_IB4,779
+smftools/hmm/HMM.py,sha256=K8rt-EHn3ylIHpQ3dHf_OZCXxCBVSS2UWTgSGOatwHw,71046
+smftools/hmm/__init__.py,sha256=BkX145eGVy-kFOtyqOcu-Hzv9ZJLDQ3cfDe51eKBTwY,585
+smftools/hmm/apply_hmm_batched.py,sha256=BBeJ8DiIuuMWzLwtDdk2DO2vvrfLCrVe4JtRYPFItIU,10648
+smftools/hmm/calculate_distances.py,sha256=KDWimQ6u-coyxCKrbTm42Fh_Alf_gURBZ0vfFaem848,644
+smftools/hmm/call_hmm_peaks.py,sha256=T-3Ld8H4t3Mgg2whBTYP9s2QL7rY-9RIzVCgB6avKhE,4625
+smftools/hmm/display_hmm.py,sha256=3WuQCPvM3wPfzAdgbhfiBTd0g5mQdx9HTUdqAxs2aj4,825
+smftools/hmm/hmm_readwrite.py,sha256=DjJ3hunpBQ7N0GVvxL7-0QUas_SkA88LVgL72mVK2cI,359
+smftools/hmm/nucleosome_hmm_refinement.py,sha256=nQWimvse6dclcXhbU707rGbRVMKHM0mU_ZhH9g2yCMA,4641
+smftools/hmm/train_hmm.py,sha256=srzRcB9LEmNuHyBM0R5Z0VEnxecifQt-MoaJhADxGT8,2477
+smftools/informatics/__init__.py,sha256=8tvVG08L_Z-bP28PusBtVt1UTnHxuKi0lImLNcP7qso,338
+smftools/informatics/basecall_pod5s.py,sha256=Ynmxscsxj6qp-zVY0RWodq513oDuHDaHnpqoepB3RUU,3930
+smftools/informatics/fast5_to_pod5.py,sha256=h-cUZX5sWwPCkQ4g3kyz3koSBjZOWI6EjSpWO8zib1I,862
+smftools/informatics/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
+smftools/informatics/subsample_fasta_from_bed.py,sha256=YqYV09rvEQdeiS5hTTrKa8xYmJfeM3Vk-UUqwpw0qBk,1983
+smftools/informatics/subsample_pod5.py,sha256=zDw9tRcrFRmPI62xkcy9dh8IfsJcuYm7R-FVeBC_g3s,4701
+smftools/informatics/archived/bam_conversion.py,sha256=I8EzXjQixMmqx2oWnoNSH5NURBhfT-krbWHkoi_M964,3330
+smftools/informatics/archived/bam_direct.py,sha256=jbEFtUIiUR8Wlp3po_sWkr19AUNS9WZjglojb9j28vo,3606
+smftools/informatics/archived/basecalls_to_adata.py,sha256=-Nag6lr_NAtU4t8jo0GSMdgIAIfmDge-5VEUPQbEatE,3692
+smftools/informatics/archived/conversion_smf.py,sha256=QhlISVi3Z-XqFKyDG_CenLojovAt5-ZhuVe9hus36lg,7177
+smftools/informatics/archived/deaminase_smf.py,sha256=mNeg1mIYYVLIiW8powEpz0CqrGRDsrmY5-aoIgwMGHs,7221
+smftools/informatics/archived/direct_smf.py,sha256=ylPGFBvRLdxLHeDJjAwq98j8Q8_lfGK3k5JJnQxrwJw,7485
+smftools/informatics/archived/print_bam_query_seq.py,sha256=8Z2ZJEOOlfWYUXiZGjteLWU4yTgvV8KQzEIBHUmamGM,838
+smftools/informatics/helpers/__init__.py,sha256=EgCIcJ6o3_R3vzsFwhtvOcKKWnmmMmN_GZXDQ_K_-NI,2693
+smftools/informatics/helpers/align_and_sort_BAM.py,sha256=gy_BU6KfDd584LPFybJ7JzNwfCD95dZXx6MccnT4Qro,3725
+smftools/informatics/helpers/aligned_BAM_to_bed.py,sha256=e6yg5-yHcw0QPFI3oRVHrhfAUj7US77Ir2VVzE3c-x8,3374
+smftools/informatics/helpers/bam_qc.py,sha256=IlrXXpCdTYIv_89SE8D5tJ1wtTzxWGjk9vc-rbC1UjU,2430
+smftools/informatics/helpers/bed_to_bigwig.py,sha256=AazYEZzKgKgukSFwCpeiApzxh1kbt11X4RFqRIiBIaY,1466
+smftools/informatics/helpers/binarize_converted_base_identities.py,sha256=yOepGaNBGfZJEsMiLRwKauvsmaHn_JRrxaGp8LmKAXs,7778
+smftools/informatics/helpers/canoncall.py,sha256=5WS6lwukc_xYTdPQy0OSj-WLbx0Rg70Cun1lCucY7w8,1741
+smftools/informatics/helpers/complement_base_list.py,sha256=k6EkLtxFoajaIufxw1p0pShJ2nPHyGLTbzZmIFFjB4o,532
+smftools/informatics/helpers/concatenate_fastqs_to_bam.py,sha256=0jy4H1ORuqaarsznv9tS1SM8CCRjaaD20NMknNvQPv0,16212
+smftools/informatics/helpers/converted_BAM_to_adata.py,sha256=sRmOtn0kNosLYfogqslDHg1Azk51l6nfNOLgQOnQjlA,14591
+smftools/informatics/helpers/converted_BAM_to_adata_II.py,sha256=9Tz-qWtK9v1DTlK6yManvhOlMcaHFQUmmrRZQ5eiECw,22229
+smftools/informatics/helpers/count_aligned_reads.py,sha256=uYyUYglF1asiaoxr-LKxPMUEbfyD7FS-dumTg2hJHzQ,2170
+smftools/informatics/helpers/demux_and_index_BAM.py,sha256=2B_UiU05ln3gYvcN9aC_w6qs8j_WAF4pHWZekAYsXm4,2114
+smftools/informatics/helpers/discover_input_files.py,sha256=hUes2iKBQW_sVmAYD-1JnLD9Ub-COEHzrRKWNFipl0g,3725
+smftools/informatics/helpers/extract_base_identities.py,sha256=2yvr5uff9ah0jylFjNMt7oRJb1z_YdhvM6htSxI0frg,3038
+smftools/informatics/helpers/extract_mods.py,sha256=MbSIiyj3zx7WlSSWMRPriLMkBtxYc1EWZiAAirMVgqA,3865
+smftools/informatics/helpers/extract_read_features_from_bam.py,sha256=SYAb4Q1HxiJzCx5bIz86MdH_TvVPsRAVodZD9082HGY,1491
+smftools/informatics/helpers/extract_read_lengths_from_bed.py,sha256=Cw39wgp1eBTV45Wk1l0c9l-upBW5N2OcgyWXTAXln90,678
+smftools/informatics/helpers/extract_readnames_from_BAM.py,sha256=3FxSNqbZ1VsOK2RfHrvevQTzhWATf5E8bZ5yVOqayvk,759
+smftools/informatics/helpers/find_conversion_sites.py,sha256=JPlDipmzeCBkV_T6esGD5ptwmbQmk8gJMTh7NMaSYd4,2480
+smftools/informatics/helpers/generate_converted_FASTA.py,sha256=UniQfERNt4FC5L8T1tzr4cLQOJc3wMBPhuWmC-lC8Fs,3747
+smftools/informatics/helpers/get_chromosome_lengths.py,sha256=sLumLrGsU_Xg_oJcdOpQyjUGpJoT2HbcmxWwbwzXUlE,1036
+smftools/informatics/helpers/get_native_references.py,sha256=fRuyEm9UJkfd5DwHmFb1bxEtNvtSI1_BxGRmrCymGkw,981
+smftools/informatics/helpers/index_fasta.py,sha256=N3IErfSiavYldeaat8xcQgA1MpykoQHcE0gHUeWuClE,267
+smftools/informatics/helpers/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
+smftools/informatics/helpers/make_modbed.py,sha256=cOQ97gPfRiCcw_fqboxousXIiOYjp78IFYLbu749U1Y,939
+smftools/informatics/helpers/modQC.py,sha256=LeOBObG8gAVVdgESIMceYhd5AW1gfN7ABo91OQtOzTM,1041
+smftools/informatics/helpers/modcall.py,sha256=LVPrdMNVp2gyQTJ4BNp8NJNm89AueDjsKaY7Gqkluho,1777
+smftools/informatics/helpers/modkit_extract_to_adata.py,sha256=yjG_Onh6YgwpE11I8mgQyN6F-8yStJgvDcI38L13i4A,52098
+smftools/informatics/helpers/ohe_batching.py,sha256=QVOiyl9fYHNIFWM23afYnQo0uaOjf1NR3ASKGVSrmuw,2975
+smftools/informatics/helpers/ohe_layers_decode.py,sha256=gIgUC9L8TFLi-fTnjR4PRzXdUaH5D6WL2Hump6XOoy0,1042
+smftools/informatics/helpers/one_hot_decode.py,sha256=3n4rzY8_aC9YKmgrftsguMsH7fUyQ-DbWmrOYF6la9s,906
+smftools/informatics/helpers/one_hot_encode.py,sha256=5hHigA6-SZLK84WH_RHo06F_6aTg7S3TJgvSr8gxGX8,1968
+smftools/informatics/helpers/plot_bed_histograms.py,sha256=sdtz_ieU_5rz8WyfAzjxbzY_w8kLdE_Rklvjax1hl3Q,10442
+smftools/informatics/helpers/run_multiqc.py,sha256=qkw48DeBdTEqzhKFGjMUlvNmTehp8wRPkcxdkwERkHc,980
+smftools/informatics/helpers/separate_bam_by_bc.py,sha256=WJZwKCYODUvzFaVWwX3SUE8sxEXmeYmSi7Dl9h2J2EY,1802
+smftools/informatics/helpers/split_and_index_BAM.py,sha256=yowMusTGoC7uRD0jAwOHzBegX6MV7f-uY-XSzkX5cBw,1253
+smftools/informatics/helpers/archived/informatics.py,sha256=gKb2ZJ_LcAeEXuQqn9e-QDF_sS4tMpMTr2vZlqa7n54,14572
+smftools/informatics/helpers/archived/load_adata.py,sha256=DhvYYqO9VLsZqhL1WjN9sd-e3fgvdXGlgTP18z1h0L0,33654
+smftools/machine_learning/__init__.py,sha256=cWyGN_QVcssqBr_VVr7xh2Inz0P7ylqUmBBcpMgsK0k,257
+smftools/machine_learning/data/__init__.py,sha256=xbfLE-gNjdgxvZ9LKTdvjAtbIHOcs2TR0Gz3YRFbo38,113
+smftools/machine_learning/data/anndata_data_module.py,sha256=ktrdMVMk5yhIUrnu-G_Xf3y7G-KP9PyhYZhobv8TCVg,10063
+smftools/machine_learning/data/preprocessing.py,sha256=dSs6Qs3wmlccFPZSpOc-uy1nlFSf68wWQKwF1iTqMok,137
+smftools/machine_learning/evaluation/__init__.py,sha256=KHvcC7bTYv-ThptAi6G8wD-hW5Iz1HPgMcQ3AewtK3c,122
+smftools/machine_learning/evaluation/eval_utils.py,sha256=t9WIevIJ6b6HqU6OYaNx7UBAa5TEIPFmZow6n_ZDZeY,1105
+smftools/machine_learning/evaluation/evaluators.py,sha256=KqYHqbVV2WOs0Yo4GIhLS_0h1oKY6nd1yi6piDWYQLg,8184
+smftools/machine_learning/inference/__init__.py,sha256=vWLQD-JNEKKNGuzDtx7vcE4czKKXEO6S-0Zp5-21fPs,172
+smftools/machine_learning/inference/inference_utils.py,sha256=aJuXvTgC8v4BOjLCgOU9vT3S2y1UGoZjq4mQpPswTQU,947
+smftools/machine_learning/inference/lightning_inference.py,sha256=34WVnPfpPDf4KM8ZN5MOsx4tYgsrUclkens6GXgB4Ek,2160
+smftools/machine_learning/inference/sklearn_inference.py,sha256=FomgQF5jFBfAj1-H2Q0_RPmvR9rDJsmUeaWOVRhbpTw,1612
+smftools/machine_learning/inference/sliding_window_inference.py,sha256=8zjQs2hGhj0Dww4gWljLVK0g002_U96dyIqQJiDdSDY,4426
+smftools/machine_learning/models/__init__.py,sha256=bMfPbQ5bDmn_kWv82virLuUhjb12Yow7t_j96afNbyA,421
+smftools/machine_learning/models/base.py,sha256=p3d77iyY8BVx0tYL0TjmOSnPNP1ZrKTzn_J05e2GF0A,9626
+smftools/machine_learning/models/cnn.py,sha256=KKZmJLQ6Bjm_HI8GULnafjz6mRy5BZ6Y0ZCgDSuS268,4465
+smftools/machine_learning/models/lightning_base.py,sha256=3nC3wajPIupFMtOq3YUf24_SHvDoW_9BIGyIvEwzN9w,13626
+smftools/machine_learning/models/mlp.py,sha256=Y2hc_qHj6vpM_mHpreFxBULn4MkR25oEA1LXu5sPA_w,820
+smftools/machine_learning/models/positional.py,sha256=EfTyYnY0pCB-aVJIWf-4DVNpyGlvx1q_09PzfrC-VlA,652
+smftools/machine_learning/models/rnn.py,sha256=uJnHDGpT2_l_HqHGsx33XGF3v3EYZPeOtSQ89uvhdpE,717
+smftools/machine_learning/models/sklearn_models.py,sha256=ssV-mR3rmcjycQEzKccRcbVaEjZp0zRNUL5-R6m1UKU,10402
+smftools/machine_learning/models/transformer.py,sha256=8YXS0vCcOWT-33h-8yeDfFM5ibPHQ-CMSEhGWzR4pm8,11039
+smftools/machine_learning/models/wrappers.py,sha256=HEY2A6-Bk6MtVZ9jOaPT8S1Qi0L98SyEg1nbKqYZoag,697
+smftools/machine_learning/training/__init__.py,sha256=teUmwpnmAl0oNFaqVrfoijEpxBjLwI5YtBwLHT3uXck,185
+smftools/machine_learning/training/train_lightning_model.py,sha256=usEBaQ4vNjfatefP5XDCXkywzgZ2D-YppGmT3-3gTGE,4070
+smftools/machine_learning/training/train_sklearn_model.py,sha256=m1k1Gsynpj6SJI64rl4B3cfXm1SliU0fwMAj1-bAAeE,3166
+smftools/machine_learning/utils/__init__.py,sha256=yOpzBc9AXbarSRfN8Ixh2Z1uWLGpgpjRR46h6E46_2w,62
+smftools/machine_learning/utils/device.py,sha256=GITrULOty2Fr96Bqt1wi1PaYl_oVgB5Z99Gfn5vQy4o,274
+smftools/machine_learning/utils/grl.py,sha256=BWBDp_kQBigrUzQpRbZzgpfr_WOcd2K2V3MQL-aAIc4,334
+smftools/plotting/__init__.py,sha256=7T3-hZFgTY0nfQgV4J6Vn9ogwkNMlY315kguZR7V1AI,866
+smftools/plotting/autocorrelation_plotting.py,sha256=wdqQ4dawibgZeXRs_G4WterkqOgxHWrJlgZ4PdtO-OA,27456
+smftools/plotting/classifiers.py,sha256=8_zabh4NNB1_yVxLD22lfrfl5yfzbEoG3XWqlIqdtrQ,13786
+smftools/plotting/general_plotting.py,sha256=JOeF_lq2hCDt1Vgy8KYLKSzNj8SXGi3z6qMt2P68TDU,31458
+smftools/plotting/hmm_plotting.py,sha256=3Eq82gty_0b8GkSMCQgUlbKfzR9h2fJ5rZkB8yYGX-M,10934
+smftools/plotting/position_stats.py,sha256=4XukYIWeWZ_aGSZg1K0t37KA2aknjNNKT5kcKFfuz8Q,17428
+smftools/plotting/qc_plotting.py,sha256=q5Ri0q89udvNUFUNxHzgk9atvQYqUkqkS5-JFq9EqoI,10045
+smftools/preprocessing/__init__.py,sha256=VqhiwJg57m0ePCRAGfX3cJniNLV2jNJpoXZEM2j-0wU,1687
+smftools/preprocessing/add_read_length_and_mapping_qc.py,sha256=zD_Kxw3DvyOypfuSMGv0ESyt-02w4XlAAMqQxb7yDNQ,5700
+smftools/preprocessing/append_base_context.py,sha256=ohtdHNS1Y9ttLvhLKSwrOyar7HyU2Dw0Ach9WVx5QM8,6221
+smftools/preprocessing/append_binary_layer_by_base_context.py,sha256=I3iiZkVqqB1KqSiA-s-ctl-ESkuTpd7Ot82M0xv_Cm4,6202
+smftools/preprocessing/binarize_on_Youden.py,sha256=O5E3vFc2zXMfKW0p0JGDlmRKEx2_VP6dAqfvrumzz00,1797
+smftools/preprocessing/binary_layers_to_ohe.py,sha256=Lxd8knelNTaUozfGMFNMlnrOb6uP28Laj3Ymw6cRHL0,1826
+smftools/preprocessing/calculate_complexity.py,sha256=cXMpFrhkwkPipQo2GZGT5yFknMYUMt1t8gz0Cse1DrA,3288
+smftools/preprocessing/calculate_complexity_II.py,sha256=DGfl0jkuBPUpzhKVItN0W7EPzh-QYuR4IxRObPE6gAQ,9301
+smftools/preprocessing/calculate_consensus.py,sha256=6zRpRmb2xdfDu5hctZrReALRb7Pjn8sy8xJZTm3o0nU,2442
+smftools/preprocessing/calculate_coverage.py,sha256=4WTILzKLzxGLSsQrZkshXP-IRQpoVu3Fkqc0QTpux3Y,2132
+smftools/preprocessing/calculate_pairwise_differences.py,sha256=5zJbNNaFld5qgKRoPyplCmMHflbvAQ9eKWCXPXPpJ60,1774
+smftools/preprocessing/calculate_pairwise_hamming_distances.py,sha256=e5Mzyex7pT29H2PY014uU4Fi_eewbut1JkzC1ffBbCg,961
+smftools/preprocessing/calculate_position_Youden.py,sha256=9GY_WWwaxpB2Xppck3WT1zHtFOhTXrpuDIgbxLC9A7E,7450
+smftools/preprocessing/calculate_read_length_stats.py,sha256=gNNePwMqYZJidzGgT1ZkfSlvc5Y3I3bi5KNYpP6wQQc,4584
+smftools/preprocessing/calculate_read_modification_stats.py,sha256=fQYtwsGt6zq7QBlWtAEaFOkbV_4yXjrj9GnBryEEztc,4779
+smftools/preprocessing/clean_NaN.py,sha256=IOcnN5YF05gpPQc3cc3IS83petCnhCpkYiyT6bXEyx0,1937
+smftools/preprocessing/filter_adata_by_nan_proportion.py,sha256=GZcvr2JCsthX8EMw34S9-W3fc6JElw6ka99Jy6f2JvA,1292
+smftools/preprocessing/filter_reads_on_length_quality_mapping.py,sha256=93LgTy_vsPnOZgoiXhZ1-w_pix2oFdBk-dsBUoz33Go,7379
+smftools/preprocessing/filter_reads_on_modification_thresholds.py,sha256=wOmHhQj3xQALQdtQ4-v4POEOat5bEJa-BVmzEE_yrKA,19403
+smftools/preprocessing/flag_duplicate_reads.py,sha256=D7KrDuyy_TSgGvB5aRRmY01k36p92n48YEwmwsUd3IY,65595
+smftools/preprocessing/invert_adata.py,sha256=HYMJ1sR3Ui8j6bDjY8OcVQOETzZV-_rrpIYaWLZL6S4,1049
+smftools/preprocessing/load_sample_sheet.py,sha256=AjJf2MrqGHJJ2rNjYi09zV1QkLTq8qGaHGVklXHnPuU,1908
+smftools/preprocessing/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
+smftools/preprocessing/min_non_diagonal.py,sha256=hx1asW8CEmLaIroZISW8EcAf_RnBEC_nofGD8QG0b1E,711
+smftools/preprocessing/recipes.py,sha256=cfKEpKW8TtQLe1CMdSHyPuIgKiWOPn7uP6uMIoRlnaQ,7063
+smftools/preprocessing/subsample_adata.py,sha256=ivJvJIOvEtyvAjqZ7cwEeVedm4QgJxCJEI7sFaTuI3w,2360
+smftools/preprocessing/archives/mark_duplicates.py,sha256=kwfstcWb7KkqeNB321dB-NLe8yd9_hZsSmpL8pCVBQg,8747
+smftools/preprocessing/archives/preprocessing.py,sha256=4mLT09A7vwRZ78FHmuwtv38mH9TQ9qrZc_WjHRhhkIw,34379
+smftools/preprocessing/archives/remove_duplicates.py,sha256=Erooi5_1VOUNfWpzddzmMNYMCl1U1jJryt7ZtMhabAs,699
+smftools/tools/__init__.py,sha256=QV3asy5_lP9wcRzpNTfxGTCcpykkbNYvzxSMpFw4KXU,719
+smftools/tools/calculate_umap.py,sha256=2arbAQdFOtnWoPq22TWicyr6fLYZ5PTNeZv_jdwuk_I,2491
+smftools/tools/cluster_adata_on_methylation.py,sha256=UDC5lpW8fZ6O-16ETu-mbflLkNBKuIg7RIzQ9r7knvA,5760
+smftools/tools/general_tools.py,sha256=YbobB6Zllz6cUq50yolGH9Jr6uuAMvEI4m3hiJ6FmAI,2561
+smftools/tools/position_stats.py,sha256=Z7VW54wUVzH1RQ9xhP6KO7ewp-xeLybd07I5umV_aqM,24369
+smftools/tools/read_stats.py,sha256=w3Zaim6l__Kt8EPCJKXTlMgO51Iy2Milj6yUb88HXiI,6324
+smftools/tools/spatial_autocorrelation.py,sha256=uQkuPi2PJCj5lZzb33IWTL-e-p3J6PdMeM88rUFfQRw,21212
+smftools/tools/subset_adata.py,sha256=nBbtAxCNteZCUBmPnZ9swQNyU74XgWM8aJHHWg2AuL0,1025
+smftools/tools/archived/apply_hmm.py,sha256=pJXCULay0zbmubrwql368y7yiHAZr2bJhuGx2QUuKnE,9321
+smftools/tools/archived/classifiers.py,sha256=mwSTpWUXBPjmUuV5i_SMG1lIPpHSMCzsKhl8wTbm-Og,36903
+smftools/tools/archived/classify_methylated_features.py,sha256=Z0N2UKw3luD3CTQ8wcUvdnMY7w-8574OJbEcwzNsy88,2897
+smftools/tools/archived/classify_non_methylated_features.py,sha256=IJERTozEs7IPL7K-VIjq2q2K36wRCW9iiNSYLAXasrA,3256
+smftools/tools/archived/subset_adata_v1.py,sha256=qyU9iCal03edb5aUS3AZ2U4TlL3uQ42jGI9hX3QF7Fc,1047
+smftools/tools/archived/subset_adata_v2.py,sha256=OKZoUpvdURPtckIQxGTWmOI5jLa-_EU62Xs3LyyehnA,1880
+smftools-0.2.1.dist-info/METADATA,sha256=MXyiJbt1w_Ln4ENxQNbLU0JWwE-S6z1oNZkd8gkf3J8,8958
+smftools-0.2.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+smftools-0.2.1.dist-info/entry_points.txt,sha256=NflK6zRv2zlvnjCnDSHycp9w9CczHLfGz9zAc4FtI0I,46
+smftools-0.2.1.dist-info/licenses/LICENSE,sha256=F8LwmL6vMPddaCt1z1S83Kh_OZv50alTlY7BvVx1RXw,1066
+smftools-0.2.1.dist-info/RECORD,,
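Each RECORD row above follows the wheel convention `path,sha256=<digest>,<size>`, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with the trailing `=` padding stripped, and the final RECORD entry itself carries no hash or size. A small sketch for recomputing a single entry, useful for spot-checking an installed file against this manifest (the path in the usage comment is a placeholder):

import base64
import hashlib
import os

def record_entry(path):
    # Reproduce one wheel-RECORD line: path, urlsafe-b64 SHA-256 without padding, byte size.
    with open(path, "rb") as handle:
        digest = hashlib.sha256(handle.read()).digest()
    encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={encoded},{os.path.getsize(path)}"

# Hypothetical usage from inside site-packages:
# print(record_entry("smftools/_version.py"))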
smftools/informatics/helpers/LoadExperimentConfig.py
@@ -1,75 +0,0 @@
-## LoadExperimentConfig
-
-class LoadExperimentConfig:
-    """
-    Loads in the experiment configuration csv and saves global variables with experiment configuration parameters.
-    Parameters:
-        experiment_config (str): A string representing the file path to the experiment configuration csv file.
-
-    Attributes:
-        var_dict (dict): A dictionary containing experiment configuration parameters.
-
-    Example:
-        >>> import pandas as pd
-        >>> from io import StringIO
-        >>> csv_data = '''variable,value,type
-        ... mapping_threshold,0.05,float
-        ... batch_size,4,int
-        ... testing_bool,True,bool
-        ... strands,"[bottom, top]",list
-        ... split_dir,split_bams,string
-        ... pod5_dir,None,string
-        ... pod5_dir,,string
-        ... '''
-        >>> csv_file = StringIO(csv_data)
-        >>> df = pd.read_csv(csv_file)
-        >>> df.to_csv('test_config.csv', index=False)
-        >>> config_loader = LoadExperimentConfig('test_config.csv')
-        >>> config_loader.var_dict['mapping_threshold']
-        0.05
-        >>> config_loader.var_dict['batch_size']
-        4
-        >>> config_loader.var_dict['testing_bool']
-        True
-        >>> config_loader.var_dict['strands']
-        ['bottom', 'top']
-        >>> config_loader.var_dict['split_dir']
-        'split_bams'
-        >>> config_loader.var_dict['pod5_dir'] is None
-        True
-        >>> config_loader.var_dict['pod5_dir'] is None
-        True
-    """
-    def __init__(self, experiment_config):
-        import pandas as pd
-        print(f"Loading experiment config from {experiment_config}")
-        # Read the CSV into a pandas DataFrame
-        df = pd.read_csv(experiment_config)
-        # Initialize an empty dictionary to store variables
-        var_dict = {}
-        # Iterate through each row in the DataFrame
-        for _, row in df.iterrows():
-            var_name = str(row['variable'])
-            value = row['value']
-            dtype = row['type']
-            # Handle empty and None values
-            if pd.isna(value) or value in ['None', '']:
-                value = None
-            else:
-                # Handle different data types
-                if dtype == 'list':
-                    # Convert the string representation of a list to an actual list
-                    value = value.strip('()[]').replace(', ', ',').split(',')
-                elif dtype == 'int':
-                    value = int(value)
-                elif dtype == 'float':
-                    value = float(value)
-                elif dtype == 'bool':
-                    value = value.lower() == 'true'
-                elif dtype == 'string':
-                    value = str(value)
-            # Store the variable in the dictionary
-            var_dict[var_name] = value
-        # Save the dictionary as an attribute of the class
-        self.var_dict = var_dict
-

smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py
@@ -1,53 +0,0 @@
-# plot_read_length_and_coverage_histograms
-
-def plot_read_length_and_coverage_histograms(bed_file, plotting_directory):
-    """
-    Plots read length and coverage statistics for each record.
-
-    Parameters:
-        bed_file (str): Path to the bed file to derive read lengths and coverage from.
-        plot_directory (str): Path to the directory to write out historgrams.
-
-    Returns:
-        None
-    """
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import os
-
-    bed_basename = os.path.basename(bed_file).split('.bed')[0]
-    # Load the BED file into a DataFrame
-    print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
-    df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name'])
-
-    # Group by chromosome
-    grouped = df.groupby('chromosome')
-
-    for chrom, group in grouped:
-        # Plot read length histogram
-        plt.figure(figsize=(12, 6))
-        plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
-        plt.title(f'Read Length Histogram of reads aligned to {chrom}')
-        plt.xlabel('Read Length')
-        plt.ylabel('Count')
-        plt.grid(True)
-        save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
-        plt.savefig(save_name)
-        plt.close()
-
-        # Compute coverage
-        coverage = np.zeros(group['end'].max())
-        for _, row in group.iterrows():
-            coverage[row['start']:row['end']] += 1
-
-        # Plot coverage histogram
-        plt.figure(figsize=(12, 6))
-        plt.plot(coverage, color='b')
-        plt.title(f'Coverage Histogram for {chrom}')
-        plt.xlabel('Position')
-        plt.ylabel('Coverage')
-        plt.grid(True)
-        save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
-        plt.savefig(save_name)
-        plt.close()

smftools/informatics/load_adata.py
@@ -1,182 +0,0 @@
-## load_adata
-
-def load_adata(config_path):
-    """
-    High-level function to call for converting raw sequencing data to an adata object.
-    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
-    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
-    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
-
-    Parameters:
-        config_path (str): A string representing the file path to the experiment configuration csv file.
-
-    Returns:
-        None
-    """
-    # Lazy importing of packages
-    from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam, extract_read_features_from_bam
-    from .fast5_to_pod5 import fast5_to_pod5
-    from .subsample_fasta_from_bed import subsample_fasta_from_bed
-    import os
-    import numpy as np
-    import anndata as ad
-    from pathlib import Path
-
-    # Default params
-    bam_suffix = '.bam' # If different, change from here.
-    split_dir = 'demultiplexed_BAMs' # If different, change from here.
-    strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
-    conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
-
-    # Load experiment config parameters into global variables
-    experiment_config = LoadExperimentConfig(config_path)
-    var_dict = experiment_config.var_dict
-
-    # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
-    default_value = None
-
-    # General config variable init
-    smf_modality = var_dict.get('smf_modality', default_value) # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Necessary.
-    input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
-    output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
-    fasta = var_dict.get('fasta', default_value) # Path to reference FASTA.
-    fasta_regions_of_interest = var_dict.get("fasta_regions_of_interest", default_value) # Path to a bed file listing coordinate regions of interest within the FASTA to include. Optional.
-    mapping_threshold = var_dict.get('mapping_threshold', default_value) # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
-    experiment_name = var_dict.get('experiment_name', default_value) # A key term to add to the AnnData file name.
-    model_dir = var_dict.get('model_dir', default_value) # needed for dorado basecaller
-    model = var_dict.get('model', default_value) # needed for dorado basecaller
-    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
-    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
-    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
-    input_already_demuxed = var_dict.get('input_already_demuxed', default_value) # If the input files are already demultiplexed.
-    threads = var_dict.get('threads', default_value) # number of cpu threads available for multiprocessing
-    # Conversion specific variable init
-    conversion_types = var_dict.get('conversion_types', default_value)
-    # Direct methylation specific variable init
-    filter_threshold = var_dict.get('filter_threshold', default_value)
-    m6A_threshold = var_dict.get('m6A_threshold', default_value)
-    m5C_threshold = var_dict.get('m5C_threshold', default_value)
-    hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
-    thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
-    mod_list = var_dict.get('mod_list', default_value)
-    batch_size = var_dict.get('batch_size', default_value)
-    device = var_dict.get('device', 'auto')
-    make_bigwigs = var_dict.get('make_bigwigs', default_value)
-    skip_unclassified = var_dict.get('skip_unclassified', True)
-    delete_batch_hdfs = var_dict.get('delete_batch_hdfs', True)
-
-    # Make initial output directory
-    make_dirs([output_directory])
-    os.chdir(output_directory)
-    # Define the pathname to split BAMs into later during demultiplexing.
-    split_path = os.path.join(output_directory, split_dir)
-
-    # If fasta_regions_of_interest is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
-    if fasta_regions_of_interest and '.bed' in fasta_regions_of_interest:
-        fasta_basename = os.path.basename(fasta).split('.fa')[0]
-        bed_basename_minus_suffix = os.path.basename(fasta_regions_of_interest).split('.bed')[0]
-        output_FASTA = fasta_basename + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta'
-        subsample_fasta_from_bed(fasta, fasta_regions_of_interest, output_directory, output_FASTA)
-        fasta = os.path.join(output_directory, output_FASTA)
-
-    # If conversion_types is passed:
-    if conversion_types:
-        conversions += conversion_types
-
-    # Get the input filetype
-    if Path(input_data_path).is_file():
-        input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
-        input_is_pod5 = input_data_filetype in ['.pod5','.p5']
-        input_is_fast5 = input_data_filetype in ['.fast5','.f5']
-        input_is_fastq = input_data_filetype in ['.fastq', '.fq']
-        input_is_bam = input_data_filetype == bam_suffix
-        if input_is_fastq:
-            fastq_paths = [input_data_path]
-    elif Path(input_data_path).is_dir():
-        # Get the file names in the input data dir
-        input_files = os.listdir(input_data_path)
-        input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
-        input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
-        input_is_fastq = sum([True for file in input_files if '.fastq' in file or '.fq' in file])
-        input_is_bam = sum([True for file in input_files if bam_suffix in file])
-        if input_is_fastq:
-            fastq_paths = [os.path.join(input_data_path, file) for file in input_files if '.fastq' in file or '.fq' in file]
-
-    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
-    if input_is_fast5 and not input_is_pod5:
-        # take the input directory of fast5 files and write out a single pod5 file into the output directory.
-        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
-        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
-        fast5_to_pod5(input_data_path, output_pod5)
-        # Reassign the pod5_dir variable to point to the new pod5 file.
-        input_data_path = output_pod5
-        input_is_pod5 = True
-        input_is_fast5 = False
-
-    elif input_is_fastq:
-        output_bam = os.path.join(output_directory, 'FASTQs_concatenated_into_BAM.bam')
-        concatenate_fastqs_to_bam(fastq_paths, output_bam, barcode_tag='BC', gzip_suffix='.gz')
-        input_data_path = output_bam
-        input_is_bam = True
-        input_is_fastq = False
-
-    if input_is_pod5:
-        basecall = True
-    elif input_is_bam:
-        basecall = False
-    else:
-        print('Error, can not find input bam or pod5')
-
-    if smf_modality == 'conversion':
-        from .conversion_smf import conversion_smf
-        final_adata, final_adata_path, sorted_output, bam_files = conversion_smf(fasta, output_directory, conversions, strands, model_dir, model, input_data_path, split_path
-        , barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed)
-    elif smf_modality == 'direct':
-        from .direct_smf import direct_smf
-        # need to add input_already_demuxed workflow here.
-        final_adata, final_adata_path, sorted_output, bam_files = direct_smf(fasta, output_directory, mod_list,model_dir, model, thresholds, input_data_path, split_path
-        , barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads)
-    else:
-        print("Error")
-
-    # Read in the final adata object and append final metadata
-    #print(f'Reading in adata from {final_adata_path} to add final metadata')
-    # final_adata = ad.read_h5ad(final_adata_path)
-
-    # Adding read query length metadata to adata object.
-    read_metrics = {}
-    for bam_file in bam_files:
-        bam_read_metrics = extract_read_features_from_bam(bam_file)
-        read_metrics.update(bam_read_metrics)
-    #read_metrics = extract_read_features_from_bam(sorted_output)
-
-    query_read_length_values = []
-    query_read_quality_values = []
-    reference_lengths = []
-    # Iterate over each row of the AnnData object
-    for obs_name in final_adata.obs_names:
-        # Fetch the value from the dictionary using the obs_name as the key
-        value = read_metrics.get(obs_name, np.nan) # Use np.nan if the key is not found
-        if type(value) is list:
-            query_read_length_values.append(value[0])
-            query_read_quality_values.append(value[1])
-            reference_lengths.append(value[2])
-        else:
-            query_read_length_values.append(value)
-            query_read_quality_values.append(value)
-            reference_lengths.append(value)
-
-    # Add the new column to adata.obs
-    final_adata.obs['query_read_length'] = query_read_length_values
-    final_adata.obs['query_read_quality'] = query_read_quality_values
-    final_adata.obs['query_length_to_reference_length_ratio'] = np.array(query_read_length_values) / np.array(reference_lengths)
-
-    final_adata.obs['Raw_methylation_signal'] = np.nansum(final_adata.X, axis=1)
-    final_adata.obs['Raw_per_base_methylation_average'] = final_adata.obs['Raw_methylation_signal'] / final_adata.obs['query_read_length']
-
-    print('Saving final adata')
-    if ".gz" in final_adata_path:
-        final_adata.write_h5ad(f"{final_adata_path}", compression='gzip')
-    else:
-        final_adata.write_h5ad(f"{final_adata_path}.gz", compression='gzip')
-    print('Final adata saved')

smftools/preprocessing/append_C_context.py
@@ -1,82 +0,0 @@
-## append_C_context
-
-## Conversion SMF Specific
-# Read methylation QC
-def append_C_context(adata, obs_column='Reference', use_consensus=False, native=False):
-    """
-    Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
-
-    Parameters:
-        adata (AnnData): The input adata object.
-        obs_column (str): The observation column in which to stratify on. Default is 'Reference', which should not be changed for most purposes.
-        use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
-        native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-
-    print('Adding Cytosine context based on reference FASTA sequence for sample')
-
-    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C', 'any_C_site']
-    categories = adata.obs[obs_column].cat.categories
-    for cat in categories:
-        # Assess if the strand is the top or bottom strand converted
-        if 'top' in cat:
-            strand = 'top'
-        elif 'bottom' in cat:
-            strand = 'bottom'
-
-        if native:
-            basename = cat.split(f"_{strand}")[0]
-            if use_consensus:
-                sequence = adata.uns[f'{basename}_consensus_sequence']
-            else:
-                # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                sequence = adata.uns[f'{basename}_FASTA_sequence']
-        else:
-            basename = cat.split(f"_{strand}")[0]
-            if use_consensus:
-                sequence = adata.uns[f'{basename}_consensus_sequence']
-            else:
-                # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                sequence = adata.uns[f'{basename}_FASTA_sequence']
-        # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
-        boolean_dict = {}
-        for site_type in site_types:
-            boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-
-        if strand == 'top':
-            # Iterate through the sequence and apply the criteria
-            for i in range(1, len(sequence) - 1):
-                if sequence[i] == 'C':
-                    boolean_dict[f'{cat}_any_C_site'][i] = True
-                    if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                        boolean_dict[f'{cat}_GpC_site'][i] = True
-                    elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                        boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                    elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                        boolean_dict[f'{cat}_CpG_site'][i] = True
-                    elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                        boolean_dict[f'{cat}_other_C'][i] = True
-        elif strand == 'bottom':
-            # Iterate through the sequence and apply the criteria
-            for i in range(1, len(sequence) - 1):
-                if sequence[i] == 'G':
-                    boolean_dict[f'{cat}_any_C_site'][i] = True
-                    if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
-                        boolean_dict[f'{cat}_GpC_site'][i] = True
-                    elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
-                        boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                    elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
-                        boolean_dict[f'{cat}_CpG_site'][i] = True
-                    elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
-                        boolean_dict[f'{cat}_other_C'][i] = True
-        else:
-            print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
-
-        for site_type in site_types:
-            adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-            adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
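For reference, the removed LoadExperimentConfig consumed a flat CSV of `variable,value,type` rows, exactly as its doctest shows, and the removed load_adata (also in this diff) then pulled individual settings out of the resulting var_dict. A hypothetical minimal conversion-workflow config in that old 0.1.7-era format, with placeholder paths and only variable names that appear in the removed code:

import pandas as pd

# Hypothetical minimal config; variable names mirror the var_dict.get(...) calls
# in the removed load_adata, and all paths are placeholders.
rows = [
    ("smf_modality", "conversion", "string"),
    ("input_data_path", "/path/to/pod5_dir", "string"),
    ("output_directory", "/path/to/output", "string"),
    ("fasta", "/path/to/reference.fasta", "string"),
    ("mapping_threshold", "0.05", "float"),
    ("experiment_name", "example_run", "string"),
]
pd.DataFrame(rows, columns=["variable", "value", "type"]).to_csv("experiment_config.csv", index=False)

In 0.2.1 this CSV is gone; configuration presumably moves to the YAML files added under smftools/config (conversion.yaml, deaminase.yaml, direct.yaml, default.yaml) together with config/experiment_config.py.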