consenrich 0.7.5b1__cp314-cp314-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

@@ -0,0 +1,172 @@
1
+ # -*- coding: utf-8 -*-
2
+ r"""Various constants and genome resources used in Consenrich."""
3
+
4
+ import logging
5
+ import os
6
+ from typing import List, Optional
7
+
8
# Configure root logging once, at import time.
# NOTE: ``logging.basicConfig`` does nothing when the root logger already has
# handlers, so a second call that followed this one (requesting WARNING) could
# never take effect. It has been removed; the effective configuration (INFO,
# same format) is unchanged.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
17
+
18
# Effective genome sizes, in base pairs, for each supported assembly.
# Layout: EFFECTIVE_GENOME_SIZES[assembly][readLength] -> size.
# Every row of the table lists the sizes for read lengths
# 50, 75, 100, 150, 200 and 250 (in that order).
# NOTE: there is no "ce10" row, although resolveGenomeName can return "ce10";
# getEffectiveGenomeSize raises ValueError for that assembly.
EFFECTIVE_GENOME_SIZES = {
    assembly: dict(zip((50, 75, 100, 150, 200, 250), sizes))
    for assembly, sizes in (
        (
            "hg19",
            (2685511454, 2736124898, 2776919708, 2827436883, 2855463800, 2855044784),
        ),
        (
            "hg38",
            (2701495711, 2747877702, 2805636231, 2862010428, 2887553103, 2898802627),
        ),
        (
            "t2t",
            (2725240337, 2786136059, 2814334875, 2931551487, 2936403235, 2960856300),
        ),
        (
            "mm10",
            (2308125299, 2407883243, 2467481008, 2494787038, 2520868989, 2538590322),
        ),
        (
            "mm39",
            (2309746861, 2410055689, 2468088461, 2495461690, 2521902382, 2538633971),
        ),
        (
            "dm3",
            (130428510, 135004387, 139647132, 144307658, 148523810, 151901455),
        ),
        (
            "dm6",
            (125464678, 127324557, 129789773, 129940985, 132508963, 132900923),
        ),
        (
            "ce11",
            (95159402, 96945370, 98259898, 98721103, 98672558, 101271756),
        ),
    )
}
84
+
85
+
86
# Lower-cased aliases mapped to the canonical assembly name returned to callers.
_GENOME_ALIASES = {
    "hg19": "hg19",
    "grch37": "hg19",
    "hg38": "hg38",
    "grch38": "hg38",
    "t2t": "t2t",
    "chm13": "t2t",
    "t2t-chm13": "t2t",
    "mm10": "mm10",
    "grcm38": "mm10",
    "mm39": "mm39",
    "grcm39": "mm39",
    "dm3": "dm3",
    "dm6": "dm6",
    "ce10": "ce10",
    "ws220": "ce10",
    "ce11": "ce11",
    "wbcel235": "ce11",
}


def resolveGenomeName(genome: str) -> str:
    r"""Standardize the genome name for consistency.

    Matching is case-insensitive and ignores leading/trailing whitespace, so
    values read from configuration files (e.g. ``" GRCh38\n"``) resolve too.

    :param genome: Name of the genome. See :class:`consenrich.core.genomeParams`.
    :type genome: str
    :return: Standardized genome name.
    :rtype: str
    :raises ValueError: If the genome is not recognized (this includes
        non-string input such as ``None``).
    """
    # Guard on the type so that bad input surfaces as the documented
    # ValueError rather than an AttributeError from ``.lower()``.
    if isinstance(genome, str):
        canonical = _GENOME_ALIASES.get(genome.strip().lower())
        if canonical is not None:
            return canonical
    raise ValueError(
        f"Genome {genome} is not recognized. Please provide a valid genome name or manually specify resources"
    )
116
+
117
+
118
def getEffectiveGenomeSize(genome: str, readLength: int) -> int:
    r"""Look up the effective genome size for a genome and read length.

    The genome name is first standardized with
    :func:`consenrich.constants.resolveGenomeName`. If ``readLength`` has no
    exact entry for that genome, the entry whose read length is closest to
    ``readLength`` is used instead.

    :param genome: Name of the genome. See :func:`consenrich.constants.resolveGenomeName` and :class:`consenrich.core.genomeParams`.
    :type genome: str
    :param readLength: Length of the reads. See :func:`consenrich.core.getReadLength`.
    :type readLength: int
    :raises ValueError: If the genome is not recognized or has no default sizes.
    :return: Effective genome size in base pairs.
    :rtype: int
    """
    canonicalName: str = resolveGenomeName(genome)

    # A genome can resolve to a valid name and still have no size table.
    if canonicalName not in EFFECTIVE_GENOME_SIZES:
        raise ValueError(f"Defaults not available for {genome}")

    sizesByReadLength = EFFECTIVE_GENOME_SIZES[canonicalName]
    if readLength in sizesByReadLength:
        return sizesByReadLength[readLength]

    # No exact match: fall back to the tabulated read length nearest to the
    # requested one (on a tie, ``min`` keeps the first key encountered).
    closestLength: int = int(
        min(
            sizesByReadLength.keys(),
            key=lambda candidate: abs(candidate - readLength),
        )
    )
    return sizesByReadLength[closestLength]
143
+
144
+
145
def getGenomeResourceFile(
    genome: str, fileType: str, dir_: str = "data"
) -> str:
    r"""Get the path to a genome resource file.

    The file name is built from ``genome`` exactly as given (it is not passed
    through :func:`consenrich.constants.resolveGenomeName` here) plus a suffix
    selected by ``fileType``.

    :param genome: the genome assembly. See :func:`consenrich.constants.resolveGenomeName` and :class:`consenrich.core.genomeParams`.
    :type genome: str
    :param fileType: One of 'sizes', 'blacklist', 'sparse' (case-insensitive).
    :type fileType: str
    :param dir_: Directory holding the resource files, joined onto the
        directory of this module. An absolute path replaces the module
        directory (standard :func:`os.path.join` behavior).
    :type dir_: str
    :return: Path to the resource file.
    :rtype: str
    :raises ValueError: If not a sizes, blacklist, or sparse file.
    :raises FileNotFoundError: If the resource file does not exist.
    """
    suffixByType = {
        "sizes": ".sizes",
        "blacklist": "_blacklist.bed",
        "sparse": "_sparse.bed",
    }
    requestedType = fileType.lower()
    # Reject unsupported types explicitly; previously ``fileName`` was left
    # unbound and the function failed with UnboundLocalError instead of the
    # documented ValueError.
    if requestedType not in suffixByType:
        raise ValueError(
            f"Unknown fileType {fileType!r}: expected one of 'sizes', 'blacklist', 'sparse'."
        )
    fileName = f"{genome}{suffixByType[requestedType]}"
    filePath = os.path.join(os.path.dirname(__file__), dir_, fileName)
    if not os.path.exists(filePath):
        raise FileNotFoundError(
            f"Resource file {filePath} does not exist."
        )
    return filePath