smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. smftools/_settings.py +3 -2
  2. smftools/_version.py +1 -1
  3. smftools/datasets/F1_sample_sheet.csv +5 -0
  4. smftools/datasets/datasets.py +8 -7
  5. smftools/informatics/__init__.py +7 -5
  6. smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
  7. smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
  8. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  9. smftools/informatics/conversion_smf.py +79 -0
  10. smftools/informatics/direct_smf.py +89 -0
  11. smftools/informatics/fast5_to_pod5.py +8 -6
  12. smftools/informatics/helpers/__init__.py +18 -0
  13. smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
  14. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  15. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  16. smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
  17. smftools/informatics/helpers/canoncall.py +2 -0
  18. smftools/informatics/helpers/complement_base_list.py +21 -0
  19. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  20. smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
  21. smftools/informatics/helpers/count_aligned_reads.py +13 -9
  22. smftools/informatics/helpers/extract_base_identities.py +34 -20
  23. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  24. smftools/informatics/helpers/find_conversion_sites.py +11 -9
  25. smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
  26. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  27. smftools/informatics/helpers/index_fasta.py +12 -0
  28. smftools/informatics/helpers/modcall.py +3 -1
  29. smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
  30. smftools/informatics/helpers/ohe_batching.py +52 -0
  31. smftools/informatics/helpers/one_hot_encode.py +10 -8
  32. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  33. smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
  34. smftools/informatics/helpers/split_and_index_BAM.py +16 -4
  35. smftools/informatics/load_adata.py +127 -0
  36. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  37. smftools/informatics/subsample_pod5.py +69 -13
  38. smftools/preprocessing/__init__.py +6 -1
  39. smftools/preprocessing/append_C_context.py +37 -14
  40. smftools/preprocessing/calculate_complexity.py +2 -2
  41. smftools/preprocessing/calculate_consensus.py +47 -0
  42. smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
  43. smftools/preprocessing/calculate_coverage.py +2 -2
  44. smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
  45. smftools/preprocessing/calculate_read_length_stats.py +56 -2
  46. smftools/preprocessing/clean_NaN.py +2 -2
  47. smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
  48. smftools/preprocessing/filter_reads_on_length.py +4 -2
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/preprocessing/load_sample_sheet.py +24 -0
  51. smftools/preprocessing/make_dirs.py +21 -0
  52. smftools/preprocessing/mark_duplicates.py +34 -19
  53. smftools/preprocessing/recipes.py +125 -0
  54. smftools/preprocessing/remove_duplicates.py +7 -4
  55. smftools/tools/apply_HMM.py +1 -0
  56. smftools/tools/cluster.py +0 -0
  57. smftools/tools/read_HMM.py +1 -0
  58. smftools/tools/subset_adata.py +32 -0
  59. smftools/tools/train_HMM.py +43 -0
  60. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
  61. smftools-0.1.3.dist-info/RECORD +84 -0
  62. smftools/informatics/basecalls_to_adata.py +0 -42
  63. smftools/informatics/pod5_conversion.py +0 -53
  64. smftools/informatics/pod5_direct.py +0 -55
  65. smftools/informatics/pod5_to_adata.py +0 -40
  66. smftools-0.1.1.dist-info/RECORD +0 -64
  67. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  68. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: smftools
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Single Molecule Footprinting Analysis in Python.
5
5
  Project-URL: Source, https://github.com/jkmckenna/smftools
6
+ Project-URL: Documentation, https://smftools.readthedocs.io/
6
7
  Author: Joseph McKenna
7
8
  Maintainer-email: Joseph McKenna <jkmckenna@berkeley.edu>
8
9
  License-Expression: MIT
@@ -31,6 +32,7 @@ Requires-Dist: numpy<2,>=1.22.0
31
32
  Requires-Dist: pandas>=1.4.2
32
33
  Requires-Dist: pod5>=0.1.21
33
34
  Requires-Dist: pomegranate>1.0.0
35
+ Requires-Dist: pyfaidx>=0.8.0
34
36
  Requires-Dist: pysam>=0.19.1
35
37
  Requires-Dist: scanpy>=1.9
36
38
  Requires-Dist: scikit-learn>=1.0.2
@@ -38,9 +40,6 @@ Requires-Dist: scipy>=1.7.3
38
40
  Requires-Dist: seaborn>=0.11
39
41
  Requires-Dist: torch>=1.9.0
40
42
  Requires-Dist: tqdm
41
- Provides-Extra: base-tests
42
- Requires-Dist: pytest; extra == 'base-tests'
43
- Requires-Dist: pytest-cov; extra == 'base-tests'
44
43
  Provides-Extra: docs
45
44
  Requires-Dist: ipython>=7.20; extra == 'docs'
46
45
  Requires-Dist: matplotlib!=3.6.1; extra == 'docs'
@@ -56,13 +55,16 @@ Requires-Dist: sphinx-design; extra == 'docs'
56
55
  Requires-Dist: sphinx>=7; extra == 'docs'
57
56
  Requires-Dist: sphinxcontrib-bibtex; extra == 'docs'
58
57
  Requires-Dist: sphinxext-opengraph; extra == 'docs'
58
+ Provides-Extra: tests
59
+ Requires-Dist: pytest; extra == 'tests'
60
+ Requires-Dist: pytest-cov; extra == 'tests'
59
61
  Description-Content-Type: text/markdown
60
62
 
61
63
  [![PyPI](https://img.shields.io/pypi/v/smftools.svg)](https://pypi.org/project/smftools)
62
64
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
63
65
 
64
66
  # smftools
65
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization. Data structures are compatible with analyses developed within the [scverse](https://github.com/scverse) project, including [scanpy](https://github.com/scverse/scanpy) and [scvi-tools](https://github.com/scverse/scvi-tools).
67
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
66
68
 
67
69
  ## Philosophy
68
70
  While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to at least 1 million X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
@@ -73,10 +75,14 @@ The following CLI tools need to be installed and configured before using the inf
73
75
  2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
74
76
  3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
75
77
  4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
78
+ 5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
79
+ 6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
76
80
 
77
81
  ## Modules
78
- - Informatics: Processes raw SMF data coming from Nanopore POD5 files, BAM files, or FASTQ files and organizes it into an AnnData object.
79
- - Preprocessing: Filters the AnnData object on read length, total methylation, and a variety of QC metrics.
82
+ ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
83
+ ![](docs/source/_static/smftools_informatics_diagram.png)
84
+ ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
85
+ ![](docs/source/_static/smftools_preprocessing_diagram.png)
80
86
  - Tools: Appends various analyses to the AnnData object.
81
87
  - Plotting: Visualization of analyses stored within the AnnData object.
82
88
 
@@ -0,0 +1,84 @@
1
+ smftools/__init__.py,sha256=zy4ckT7hKrLrlm6NiZQoupvc6oSN7wJsyOBCYdzukcQ,401
2
+ smftools/_settings.py,sha256=Ed8lzKUA5ncq5ZRfSp0t6_rphEEjMxts6guttwTZP5Y,409
3
+ smftools/_version.py,sha256=R5TtpJu7Qu6sOarfDpp-5Oyy8Pi2Ir3VewCvsCQiAgo,21
4
+ smftools/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
5
+ smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz,sha256=q6wJtgFRDln0o20XNCx1qad3lwcdCoylqPN7wskTfI8,2926497
6
+ smftools/datasets/F1_sample_sheet.csv,sha256=9PodIIOXK2eamYPbC6DGnXdzgi9bRDovf296j1aM0ak,259
7
+ smftools/datasets/__init__.py,sha256=xkSTlPuakVYVCuRurif9BceNBDt6bsngJvvjI8757QI,142
8
+ smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz,sha256=niOcVHaYY7h3XyvwSkN-V_NMBaRt2vTP5TrJO0CwMCs,8385050
9
+ smftools/datasets/datasets.py,sha256=0y597Ntp707bOgDwN6O-JEt9yxgplj66p0aj6Zs_IB4,779
10
+ smftools/informatics/__init__.py,sha256=WQiMBr1yjDrlmHg8UNgW2MJsq4fPrVfh-UBr5tYI9x4,326
11
+ smftools/informatics/conversion_smf.py,sha256=PS-TjgMttr3VRrT0zg5L_L01xMOewB_OXSsQyoM7DWI,4333
12
+ smftools/informatics/direct_smf.py,sha256=ue7p7deuRwaZtEh9EFV1YTE8HKRAmOsx9oaRJdjCrbY,4697
13
+ smftools/informatics/fast5_to_pod5.py,sha256=xfdZU3QluaAcR-q2uBRz8hcBwYt73nCnrFeahvi0OKQ,704
14
+ smftools/informatics/load_adata.py,sha256=i-2YCSaeLzbPfNtKPrLwfkv-9u_TrTAZrbtNAj3FRWY,7271
15
+ smftools/informatics/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
16
+ smftools/informatics/subsample_fasta_from_bed.py,sha256=YqYV09rvEQdeiS5hTTrKa8xYmJfeM3Vk-UUqwpw0qBk,1983
17
+ smftools/informatics/subsample_pod5.py,sha256=zDw9tRcrFRmPI62xkcy9dh8IfsJcuYm7R-FVeBC_g3s,4701
18
+ smftools/informatics/archived/bam_conversion.py,sha256=I8EzXjQixMmqx2oWnoNSH5NURBhfT-krbWHkoi_M964,3330
19
+ smftools/informatics/archived/bam_direct.py,sha256=jbEFtUIiUR8Wlp3po_sWkr19AUNS9WZjglojb9j28vo,3606
20
+ smftools/informatics/archived/basecalls_to_adata.py,sha256=-Nag6lr_NAtU4t8jo0GSMdgIAIfmDge-5VEUPQbEatE,3692
21
+ smftools/informatics/helpers/LoadExperimentConfig.py,sha256=gsWGoa9cydwY4Kd-hTXF2gtmxc8glRRD2V1JB88e9js,2822
22
+ smftools/informatics/helpers/__init__.py,sha256=KrfyM08_RgDf3Ajvb4KNTvcOqZiWYSIVhEznCr01Gcc,2255
23
+ smftools/informatics/helpers/align_and_sort_BAM.py,sha256=DouG6nGWXtz2ulZD5p0sEShE-4dbPudHaWcHFm4-oJA,2184
24
+ smftools/informatics/helpers/aligned_BAM_to_bed.py,sha256=eYkGQFSM2gPEauASkY_-9Yvy6727vP8Q4wx_st85Dpc,2638
25
+ smftools/informatics/helpers/bed_to_bigwig.py,sha256=AazYEZzKgKgukSFwCpeiApzxh1kbt11X4RFqRIiBIaY,1466
26
+ smftools/informatics/helpers/binarize_converted_base_identities.py,sha256=iJlDah-YJ0zx0UrlHdtgvrALVNSA0TTTdDoKmNCVg0Q,1846
27
+ smftools/informatics/helpers/canoncall.py,sha256=M7HEqhYsWMUB0tLP3hzMM0L7PhcOTXgetl5lV3GgIaw,1062
28
+ smftools/informatics/helpers/complement_base_list.py,sha256=k6EkLtxFoajaIufxw1p0pShJ2nPHyGLTbzZmIFFjB4o,532
29
+ smftools/informatics/helpers/concatenate_fastqs_to_bam.py,sha256=RXPn7e6Dcwol9tnUsfXJu3EuZcMSOJJo5LNWouovvZs,2715
30
+ smftools/informatics/helpers/converted_BAM_to_adata.py,sha256=Rsnydzpf9lMS3TQjXpbXJSSfCzhVTPn3rBDLiK-8utA,13991
31
+ smftools/informatics/helpers/count_aligned_reads.py,sha256=uYyUYglF1asiaoxr-LKxPMUEbfyD7FS-dumTg2hJHzQ,2170
32
+ smftools/informatics/helpers/extract_base_identities.py,sha256=E-_m9W82N52NjX5kz9Af5YH0S2k58hnq9KTrm4S5vgM,4370
33
+ smftools/informatics/helpers/extract_mods.py,sha256=UBFjXDKz_A6ivjcocYT1_pKjvygY2Fdg0RjQmMS8UuA,2269
34
+ smftools/informatics/helpers/extract_readnames_from_BAM.py,sha256=3FxSNqbZ1VsOK2RfHrvevQTzhWATf5E8bZ5yVOqayvk,759
35
+ smftools/informatics/helpers/find_conversion_sites.py,sha256=5AghDQzEoSvE2Og98VsKoeWUFSLnIGY1LnRu1BtQavM,3700
36
+ smftools/informatics/helpers/generate_converted_FASTA.py,sha256=ueaAsFnBuc7zKwkBivBR3DJg4DtkxkHHIQcVVSWzv-w,5161
37
+ smftools/informatics/helpers/get_chromosome_lengths.py,sha256=sLumLrGsU_Xg_oJcdOpQyjUGpJoT2HbcmxWwbwzXUlE,1036
38
+ smftools/informatics/helpers/get_native_references.py,sha256=fRuyEm9UJkfd5DwHmFb1bxEtNvtSI1_BxGRmrCymGkw,981
39
+ smftools/informatics/helpers/index_fasta.py,sha256=N3IErfSiavYldeaat8xcQgA1MpykoQHcE0gHUeWuClE,267
40
+ smftools/informatics/helpers/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
41
+ smftools/informatics/helpers/make_modbed.py,sha256=cOQ97gPfRiCcw_fqboxousXIiOYjp78IFYLbu749U1Y,939
42
+ smftools/informatics/helpers/modQC.py,sha256=LeOBObG8gAVVdgESIMceYhd5AW1gfN7ABo91OQtOzTM,1041
43
+ smftools/informatics/helpers/modcall.py,sha256=9PH7Peq4y-VBqQcMkbv0TwgePBlD5aM4_FmI7H4hbQQ,1142
44
+ smftools/informatics/helpers/modkit_extract_to_adata.py,sha256=duPlRAIz4VWM-jm9iaLY7N6JHQcun_L0nhr2VyUjNTI,38184
45
+ smftools/informatics/helpers/ohe_batching.py,sha256=_Mz2p1We5PVIb8S6Hbq_hREKJ9mGQiADwfFK_NgMGhA,1909
46
+ smftools/informatics/helpers/one_hot_encode.py,sha256=hpZAuwa9ndkhyCm9sO65KVHE0lbFDKqRylfliEKyD4o,632
47
+ smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py,sha256=tAnXFleGzXJNjHRAgZ0NUJuZ0P3aKmUYIrK-V9VoJKY,1860
48
+ smftools/informatics/helpers/separate_bam_by_bc.py,sha256=Fsi8OEmv5Ny13cWoHVV9JmEjVFEXT_ZxbBOlRdmyPbE,1742
49
+ smftools/informatics/helpers/split_and_index_BAM.py,sha256=_TFJ8fcLbIf37JG83hSc1zgs1yxX70-NhA8y-PbhTpo,1966
50
+ smftools/informatics/helpers/archived/informatics.py,sha256=gKb2ZJ_LcAeEXuQqn9e-QDF_sS4tMpMTr2vZlqa7n54,14572
51
+ smftools/informatics/helpers/archived/load_adata.py,sha256=DhvYYqO9VLsZqhL1WjN9sd-e3fgvdXGlgTP18z1h0L0,33654
52
+ smftools/plotting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
+ smftools/preprocessing/__init__.py,sha256=5FQNrj51KmaDLeAGGBA8iWMkYiSOe7O91ES8mT4aVtE,1399
54
+ smftools/preprocessing/append_C_context.py,sha256=pP5u9o5U4JmHras0PK6yas65u4-U5KlX3sKLb-duo80,3728
55
+ smftools/preprocessing/binarize_on_Youden.py,sha256=slkkt56DZ1FZWy8Un5mNJEZ49JlPnPKow2zU4GoHEr8,2303
56
+ smftools/preprocessing/binary_layers_to_ohe.py,sha256=931eHuVda6pMZTvC7jVTKkY2a_KQWpSfgi-nkA5NmaI,1238
57
+ smftools/preprocessing/calculate_complexity.py,sha256=ut60et8bmIswtiLhctJWHNseIV4ZRQultYdtJPHcRPs,3224
58
+ smftools/preprocessing/calculate_consensus.py,sha256=6zRpRmb2xdfDu5hctZrReALRb7Pjn8sy8xJZTm3o0nU,2442
59
+ smftools/preprocessing/calculate_converted_read_methylation_stats.py,sha256=Si0DcES0lLMvg3XgdKpedxfPnXQ14tEFKrOAFRn3fHs,6059
60
+ smftools/preprocessing/calculate_coverage.py,sha256=ZgRxQGpydxQg1exkvSiy8nHmzDIPGGqL5vL9XQ2PZQ4,2068
61
+ smftools/preprocessing/calculate_pairwise_hamming_distances.py,sha256=e5Mzyex7pT29H2PY014uU4Fi_eewbut1JkzC1ffBbCg,961
62
+ smftools/preprocessing/calculate_position_Youden.py,sha256=mfQ6nFfUaEaKg_icyHA1zZlhh0wHjpLE56BZDXOdP_4,6364
63
+ smftools/preprocessing/calculate_read_length_stats.py,sha256=6m362JaCKlD0QoBUMnM2qsB6Jo_4shl7xFzqU1uZccU,4945
64
+ smftools/preprocessing/clean_NaN.py,sha256=1vieT026p0gDJCbqB_CiLvAGGxlc-5xufoKJgZuBFFk,1150
65
+ smftools/preprocessing/filter_converted_reads_on_methylation.py,sha256=SN5q0rqYtYW9j3i0sVSyTv9EmR_uLKI7GkjmJixeOU0,1307
66
+ smftools/preprocessing/filter_reads_on_length.py,sha256=sAT66bjuI8ZtXyQc9SuPzq1dPIB1CNVx6VfWqVng4Dg,2191
67
+ smftools/preprocessing/invert_adata.py,sha256=u6Y70EH0B5mXb9-HuukIlzpMgZ6rhzcJuy3YZZTx3SA,684
68
+ smftools/preprocessing/load_sample_sheet.py,sha256=uGjzG9x-1t_1lCooH85P8Tfg80GdvVx8Jv1LPl9XNFM,915
69
+ smftools/preprocessing/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
70
+ smftools/preprocessing/mark_duplicates.py,sha256=sQuPcTw8JsQoONOk-kMlAF965sIk2Pu-M7rIyfbyGGs,8145
71
+ smftools/preprocessing/min_non_diagonal.py,sha256=hx1asW8CEmLaIroZISW8EcAf_RnBEC_nofGD8QG0b1E,711
72
+ smftools/preprocessing/recipes.py,sha256=KzSw5JW0WJGzSis5Fm7moQY5PxOYl6-uYYf1NDj6nOE,7117
73
+ smftools/preprocessing/remove_duplicates.py,sha256=Erooi5_1VOUNfWpzddzmMNYMCl1U1jJryt7ZtMhabAs,699
74
+ smftools/preprocessing/archives/preprocessing.py,sha256=4mLT09A7vwRZ78FHmuwtv38mH9TQ9qrZc_WjHRhhkIw,34379
75
+ smftools/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
+ smftools/tools/apply_HMM.py,sha256=AuVtOki69-Xs4mhjhTXJzd49KCVXwixFyWSUgDjtR6s,11
77
+ smftools/tools/cluster.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
+ smftools/tools/read_HMM.py,sha256=N0MGG494VjlxYJcCVz1jN4OasGtRITZS98SJ2xB_j8k,10
79
+ smftools/tools/subset_adata.py,sha256=qyU9iCal03edb5aUS3AZ2U4TlL3uQ42jGI9hX3QF7Fc,1047
80
+ smftools/tools/train_HMM.py,sha256=x5ZcXj-heWQqDOX86nuuDoj1tPkYKl04fYA1fCKNQ0c,1380
81
+ smftools-0.1.3.dist-info/METADATA,sha256=u26Og8tpAF2TgXZztotk3Q4EuP7Fvf73s1tlIjBDD-A,6410
82
+ smftools-0.1.3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
83
+ smftools-0.1.3.dist-info/licenses/LICENSE,sha256=F8LwmL6vMPddaCt1z1S83Kh_OZv50alTlY7BvVx1RXw,1066
84
+ smftools-0.1.3.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- ## basecalls_to_adata
2
-
3
- def basecalls_to_adata(config_path):
4
- """
5
- High-level function to call for loading basecalled SMF data from a BAM file into an adata object. Also works with FASTQ for conversion SMF.
6
-
7
- Parameters:
8
- config_path (str): A string representing the file path to the experiment configuration csv file.
9
-
10
- Returns:
11
- None
12
- """
13
- from .helpers import LoadExperimentConfig, make_dirs
14
- import os
15
- bam_suffix = '.bam' # If different, change from here.
16
- split_dir = 'split_BAMs' # If different, change from here.
17
- strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
18
- conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
19
-
20
- # Load experiment config parameters into global variables
21
- experiment_config = LoadExperimentConfig(config_path)
22
- var_dict = experiment_config.var_dict
23
- for key, value in var_dict.items():
24
- globals()[key] = value
25
-
26
- split_path = os.path.join(output_directory, split_dir)
27
- make_dirs([output_directory, split_path])
28
- os.chdir(output_directory)
29
-
30
- conversions += conversion_types
31
-
32
- if smf_modality == 'conversion':
33
- from .bam_conversion import bam_conversion
34
- bam_conversion(fasta, output_directory, conversions, strands, basecalled_path, split_path, mapping_threshold, experiment_name, bam_suffix)
35
- elif smf_modality == 'direct':
36
- if bam_suffix in basecalled_path:
37
- from .bam_direct import bam_direct
38
- bam_direct(fasta, output_directory, mod_list, thresholds, basecalled_path, split_path, mapping_threshold, experiment_name, bam_suffix, batch_size)
39
- else:
40
- print('basecalls_to_adata function only work with the direct modality when the input filetype is BAM and not FASTQ.')
41
- else:
42
- print("Error")
@@ -1,53 +0,0 @@
1
- ## pod5_conversion
2
-
3
- def pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix):
4
- """
5
- Converts a POD5 file from a nanopore conversion SMF experiment to an adata object.
6
-
7
- Parameters:
8
- fasta (str): File path to the reference genome to align to.
9
- output_directory (str): A file path to the directory to output all the analyses.
10
- conversion_types (list): A list of strings of the conversion types to use in the analysis.
11
- strands (list): A list of conversion strands to use in the experiment.
12
- model (str): a string representing the file path to the dorado basecalling model.
13
- pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
14
- split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
- barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
- mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
17
- experiment_name (str): A string to provide an experiment name to the output adata file.
18
- bam_suffix (str): A suffix to add to the bam file.
19
-
20
- Returns:
21
- None
22
- """
23
- from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
24
- import os
25
- model_basename = os.path.basename(model)
26
- model_basename = model_basename.replace('.', '_')
27
- bam=f"{output_directory}/{model_basename}_canonical_basecalls"
28
- aligned_BAM=f"{bam}_aligned"
29
- aligned_sorted_BAM=f"{aligned_BAM}_sorted"
30
-
31
- os.chdir(output_directory)
32
-
33
- # 1) Convert FASTA file
34
- fasta_basename = os.path.basename(fasta)
35
- converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
36
- converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
37
- if os.path.exists(converted_FASTA):
38
- print(converted_FASTA + ' already exists. Using existing converted FASTA.')
39
- else:
40
- generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
41
-
42
- # 2) Basecall from the input POD5 to generate a singular output BAM
43
- canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix)
44
-
45
- # 3) Align the BAM to the converted reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
46
- input_BAM = bam + bam_suffix
47
- align_and_sort_BAM(converted_FASTA, input_BAM, bam_suffix, output_directory)
48
-
49
- ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
50
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
51
-
52
- # 5) Take the converted BAM and load it into an adata object.
53
- converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
@@ -1,55 +0,0 @@
1
- ## pod5_direct
2
-
3
- def pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size):
4
- """
5
- Converts a POD5 file from a nanopore native SMF experiment to an adata object.
6
-
7
- Parameters:
8
- fasta (str): File path to the reference genome to align to.
9
- output_directory (str): A file path to the directory to output all the analyses.
10
- mod_list (list): A list of strings of the modification types to use in the analysis.
11
- model (str): a string representing the file path to the dorado basecalling model.
12
- thresholds (list): A list of floats to pass for call thresholds.
13
- pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
14
- split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
- barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
- mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
17
- experiment_name (str): A string to provide an experiment name to the output adata file.
18
- bam_suffix (str): A suffix to add to the bam file.
19
- batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
20
-
21
- Returns:
22
- None
23
- """
24
- from .helpers import align_and_sort_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
25
- import os
26
- model_basename = os.path.basename(model)
27
- model_basename = model_basename.replace('.', '_')
28
- mod_string = "_".join(mod_list)
29
- bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
30
- aligned_BAM=f"{bam}_aligned"
31
- aligned_sorted_BAM=f"{aligned_BAM}_sorted"
32
- mod_bed_dir=f"{output_directory}/split_mod_beds"
33
- mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
34
-
35
- make_dirs([mod_bed_dir, mod_tsv_dir])
36
-
37
- aligned_sorted_output = aligned_sorted_BAM + bam_suffix
38
- mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
39
- mods = [mod_map[mod] for mod in mod_list]
40
-
41
- os.chdir(output_directory)
42
-
43
- # 1) Basecall using dorado
44
- modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix)
45
- # 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
46
- input_BAM = bam + bam_suffix
47
- align_and_sort_BAM(fasta, input_BAM, bam_suffix, output_directory)
48
- # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
49
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
50
- # 4) Using nanopore modkit to work with modified BAM files ###
51
- modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
52
- make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample
53
- extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix) # Extract methylations calls for split BAM files into split TSV files
54
- #5 Load the modification data from TSVs into an adata object
55
- modkit_extract_to_adata(fasta, aligned_sorted_output, mapping_threshold, experiment_name, mods, batch_size)
@@ -1,40 +0,0 @@
1
- ## pod5_to_adata
2
-
3
- def pod5_to_adata(config_path):
4
- """
5
- High-level function to call for converting raw sequencing data to an adata object.
6
-
7
- Parameters:
8
- config_path (str): A string representing the file path to the experiment configuration csv file.
9
-
10
- Returns:
11
- None
12
- """
13
- from .helpers import LoadExperimentConfig, make_dirs
14
- import os
15
- bam_suffix = '.bam' # If different, change from here.
16
- split_dir = 'split_BAMs' # If different, change from here.
17
- strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
18
- conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
19
-
20
- # Load experiment config parameters into global variables
21
- experiment_config = LoadExperimentConfig(config_path)
22
- var_dict = experiment_config.var_dict
23
- for key, value in var_dict.items():
24
- globals()[key] = value
25
-
26
- conversions += conversion_types
27
-
28
- split_path = os.path.join(output_directory, split_dir)
29
- make_dirs([output_directory, split_path])
30
- os.chdir(output_directory)
31
-
32
- if smf_modality == 'conversion':
33
- from .pod5_conversion import pod5_conversion
34
- pod5_conversion(fasta, output_directory, conversions, strands, model, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
35
- elif smf_modality == 'direct':
36
- from .pod5_direct import pod5_direct
37
- thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
38
- pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
39
- else:
40
- print("Error")
@@ -1,64 +0,0 @@
1
- smftools/__init__.py,sha256=zy4ckT7hKrLrlm6NiZQoupvc6oSN7wJsyOBCYdzukcQ,401
2
- smftools/_settings.py,sha256=RkGSrezDzZnh6AODA3o2LiSAINBxxxal5weq-2RZuM0,379
3
- smftools/_version.py,sha256=8oAxKUG747GUokmxjkrWejyJa5yPNEsoJDlXxoedxTw,21
4
- smftools/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
5
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz,sha256=q6wJtgFRDln0o20XNCx1qad3lwcdCoylqPN7wskTfI8,2926497
6
- smftools/datasets/__init__.py,sha256=xkSTlPuakVYVCuRurif9BceNBDt6bsngJvvjI8757QI,142
7
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz,sha256=niOcVHaYY7h3XyvwSkN-V_NMBaRt2vTP5TrJO0CwMCs,8385050
8
- smftools/datasets/datasets.py,sha256=FZ6e7SU6Zt1-gf3az71AgQLpyNvAEiOb5ctbwyP3XSU,553
9
- smftools/informatics/__init__.py,sha256=Aa-QNdfrpZrTPq8xqP4NBA7bb1VLLYaduH-US6pKNA4,280
10
- smftools/informatics/bam_conversion.py,sha256=mLjIVx-07sa3TDQ_qhPKoAfTjqThvyh-MAq8BDtatHg,2719
11
- smftools/informatics/bam_direct.py,sha256=2Y7C8N9QrgJyXqP3WEVZ7fi5qU-iNj_KN0ve-xvXD-U,2964
12
- smftools/informatics/basecalls_to_adata.py,sha256=krnf5fdOuXZmnwW-y_eDC5hEjODUUyMEoY5z6R-65UI,1905
13
- smftools/informatics/fast5_to_pod5.py,sha256=i2sPWME6p6jttG6WbcQFCzI22WVpwcXgZBG9RXqIlzU,717
14
- smftools/informatics/pod5_conversion.py,sha256=x_d55jk2vX5hiSWH6-W4oWdmI-uTce8iRhhBqSzYS_I,3073
15
- smftools/informatics/pod5_direct.py,sha256=aozZQdeYA5AFAEnku6un5gtGtZEdekk-6W7e7K_pBLw,3383
16
- smftools/informatics/pod5_to_adata.py,sha256=1lSr32fAKXkxU5nV3C0UBDW6i2FnSbD_aqlKG4Yn9RU,1763
17
- smftools/informatics/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
18
- smftools/informatics/subsample_pod5.py,sha256=6g8fV2K1YJFEDVHs_bTJDuOSsIkMWpw5Lvk9uittW9k,2188
19
- smftools/informatics/helpers/LoadExperimentConfig.py,sha256=gsWGoa9cydwY4Kd-hTXF2gtmxc8glRRD2V1JB88e9js,2822
20
- smftools/informatics/helpers/__init__.py,sha256=luoFmS4B3XnG68XVziqtmPEZy-X3228LEZzQaHX84Cc,1487
21
- smftools/informatics/helpers/align_and_sort_BAM.py,sha256=835lNQafsdY-PMQZAv3HgLcXNkeduZA4HUewXgGV87M,2447
22
- smftools/informatics/helpers/binarize_converted_base_identities.py,sha256=kAYtPwMY5-gUL2muXOcbSiowQ2Mkls90GXXFaCKG5Pk,1773
23
- smftools/informatics/helpers/canoncall.py,sha256=XaAM_Vd_Q3SQIIlb10-7z5Zo-YADnIcC4eMQjZt1H-E,961
24
- smftools/informatics/helpers/converted_BAM_to_adata.py,sha256=cD-4gnJqCULGexedtETo-NGz-uqq16HwK_9zgCM7w_8,9769
25
- smftools/informatics/helpers/count_aligned_reads.py,sha256=Jttzh94T2uALC-dCvjlq58tupj5VxuKj3j301ICj7eI,2058
26
- smftools/informatics/helpers/extract_base_identities.py,sha256=-iBSEgnsp3EESSOupGcDGObGWykxucD5YC7aW_pEJ2k,2733
27
- smftools/informatics/helpers/extract_mods.py,sha256=UBFjXDKz_A6ivjcocYT1_pKjvygY2Fdg0RjQmMS8UuA,2269
28
- smftools/informatics/helpers/find_conversion_sites.py,sha256=52RvFQkIBly-CqToFe2zaKkpsUSixe8rHxZfL32EygA,3570
29
- smftools/informatics/helpers/generate_converted_FASTA.py,sha256=fKmjpvBQF8fHljzcGtqUgMwdWIfnmwnhI-Y3A59_qDk,3939
30
- smftools/informatics/helpers/get_native_references.py,sha256=fRuyEm9UJkfd5DwHmFb1bxEtNvtSI1_BxGRmrCymGkw,981
31
- smftools/informatics/helpers/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
32
- smftools/informatics/helpers/make_modbed.py,sha256=cOQ97gPfRiCcw_fqboxousXIiOYjp78IFYLbu749U1Y,939
33
- smftools/informatics/helpers/modQC.py,sha256=LeOBObG8gAVVdgESIMceYhd5AW1gfN7ABo91OQtOzTM,1041
34
- smftools/informatics/helpers/modcall.py,sha256=ZcBl3QK622gG1IFy6agOWVADvsjr7WZIcglhSmYl02E,1133
35
- smftools/informatics/helpers/modkit_extract_to_adata.py,sha256=KDnFSHH0PdaANH6uzcVpoii7s5u5gIfkSMgsQCiWORs,25279
36
- smftools/informatics/helpers/one_hot_encode.py,sha256=rNHAfLG8smkpSEDBYj73cVwCNNbFhVfMW4SB2ijydMk,577
37
- smftools/informatics/helpers/separate_bam_by_bc.py,sha256=mGiB6y4xs_WATf-VDyVz4tg6BG3I6ZzYcpuEjeA1oVo,1580
38
- smftools/informatics/helpers/split_and_index_BAM.py,sha256=p2qRV-bOXj1P0JcvBqmK3ihRyM7_bVnKCi3bog_2kR8,1143
39
- smftools/informatics/helpers/archived/informatics.py,sha256=gKb2ZJ_LcAeEXuQqn9e-QDF_sS4tMpMTr2vZlqa7n54,14572
40
- smftools/informatics/helpers/archived/load_adata.py,sha256=DhvYYqO9VLsZqhL1WjN9sd-e3fgvdXGlgTP18z1h0L0,33654
41
- smftools/plotting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- smftools/preprocessing/__init__.py,sha256=PoemcyddUsfRPtJ7Ggr5G69nTXIQCFmz6lctpoEXYUA,1153
43
- smftools/preprocessing/append_C_context.py,sha256=-4P3Dkq60QW57AB9NR6iRjmz-Dl27s2V-HqWWOTB21M,2485
44
- smftools/preprocessing/binarize_on_Youden.py,sha256=slkkt56DZ1FZWy8Un5mNJEZ49JlPnPKow2zU4GoHEr8,2303
45
- smftools/preprocessing/binary_layers_to_ohe.py,sha256=931eHuVda6pMZTvC7jVTKkY2a_KQWpSfgi-nkA5NmaI,1238
46
- smftools/preprocessing/calculate_complexity.py,sha256=nDdEzHaN7KBN7QfqaFgQoTBT_6cpOa44l8IDHwa76fs,3224
47
- smftools/preprocessing/calculate_converted_read_methylation_stats.py,sha256=FCDVLOxT8iHTBeG6EbEFt_S_mIQvYosEAb9-hbc8fhA,2720
48
- smftools/preprocessing/calculate_coverage.py,sha256=46-U7YW98uL3pEVnfhBueGZOnNr3a3rWr3mT7YToglM,2019
49
- smftools/preprocessing/calculate_pairwise_hamming_distances.py,sha256=EIcIOOw0tpECR9hfZeMNlq0J_L2-J-Jp5-RzVkoGO7o,951
50
- smftools/preprocessing/calculate_position_Youden.py,sha256=mfQ6nFfUaEaKg_icyHA1zZlhh0wHjpLE56BZDXOdP_4,6364
51
- smftools/preprocessing/calculate_read_length_stats.py,sha256=wIz-EtOz7S_RDSqAAv7SoiTIQq9Ovkfs4dE4BdHXyNo,1793
52
- smftools/preprocessing/clean_NaN.py,sha256=nnm_zCEGilgDGvxxPBP81sYHJ9LrZ2MVvI1ydf0iPxk,1160
53
- smftools/preprocessing/filter_converted_reads_on_methylation.py,sha256=3BwBWyPnVuOH46J2mgafRXcNzWYRxc4sr7nKTInLyfU,1242
54
- smftools/preprocessing/filter_reads_on_length.py,sha256=LfGz0h3Xf8XUxv3ifXLbfeAOzSYYZVUJcHy73ykUu3A,2135
55
- smftools/preprocessing/invert_adata.py,sha256=6ab7WjUjGUYd2uqXV93k1B4U6laxSwlWSgaR0Lg7apc,655
56
- smftools/preprocessing/mark_duplicates.py,sha256=nKbdUEikJFekWuJ259J4XBPE5elcp44VkyY14jLLZEk,7167
57
- smftools/preprocessing/min_non_diagonal.py,sha256=hx1asW8CEmLaIroZISW8EcAf_RnBEC_nofGD8QG0b1E,711
58
- smftools/preprocessing/remove_duplicates.py,sha256=W0QQ5LLmIGj8OTfXuiTLJjFhOjv5r-TTipTupq-nf50,432
59
- smftools/preprocessing/archives/preprocessing.py,sha256=4mLT09A7vwRZ78FHmuwtv38mH9TQ9qrZc_WjHRhhkIw,34379
60
- smftools/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
- smftools-0.1.1.dist-info/METADATA,sha256=8iWTWzl1ZIZvpYRbp6ov1NizrUJMwBbWNCs31QV3LjY,6262
62
- smftools-0.1.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
63
- smftools-0.1.1.dist-info/licenses/LICENSE,sha256=F8LwmL6vMPddaCt1z1S83Kh_OZv50alTlY7BvVx1RXw,1066
64
- smftools-0.1.1.dist-info/RECORD,,