consenrich 0.7.5b1__cp314-cp314-macosx_10_15_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of consenrich might be problematic. Click here for more details.
- consenrich/__init__.py +11 -0
- consenrich/cconsenrich.c +48693 -0
- consenrich/cconsenrich.cpython-314-darwin.so +0 -0
- consenrich/cconsenrich.pyx +861 -0
- consenrich/consenrich.py +1384 -0
- consenrich/constants.py +172 -0
- consenrich/core.py +1438 -0
- consenrich/data/ce10.sizes +6 -0
- consenrich/data/ce10_blacklist.bed +100 -0
- consenrich/data/ce10_sparse.bed +11828 -0
- consenrich/data/ce11.sizes +6 -0
- consenrich/data/ce11_blacklist.bed +97 -0
- consenrich/data/ce11_sparse.bed +11828 -0
- consenrich/data/dm6.sizes +7 -0
- consenrich/data/dm6_blacklist.bed +182 -0
- consenrich/data/dm6_sparse.bed +20000 -0
- consenrich/data/hg19.sizes +24 -0
- consenrich/data/hg19_blacklist.bed +834 -0
- consenrich/data/hg19_sparse.bed +288358 -0
- consenrich/data/hg38.sizes +24 -0
- consenrich/data/hg38_blacklist.bed +636 -0
- consenrich/data/hg38_sparse.bed +288699 -0
- consenrich/data/mm10.sizes +21 -0
- consenrich/data/mm10_blacklist.bed +3435 -0
- consenrich/data/mm10_sparse.bed +100400 -0
- consenrich/data/mm39.sizes +21 -0
- consenrich/data/mm39_blacklist.bed +3360 -0
- consenrich/data/mm39_sparse.bed +100381 -0
- consenrich/detrorm.py +249 -0
- consenrich/matching.py +907 -0
- consenrich/misc_util.py +122 -0
- consenrich-0.7.5b1.dist-info/METADATA +65 -0
- consenrich-0.7.5b1.dist-info/RECORD +37 -0
- consenrich-0.7.5b1.dist-info/WHEEL +6 -0
- consenrich-0.7.5b1.dist-info/entry_points.txt +2 -0
- consenrich-0.7.5b1.dist-info/licenses/LICENSE +21 -0
- consenrich-0.7.5b1.dist-info/top_level.txt +1 -0
consenrich/constants.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
r"""Various constants and genome resources used in Consenrich."""
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
# Configure the root logger once, at import time. ``logging.basicConfig`` does
# nothing when the root logger already has handlers, so issuing a second call
# with a different level can never take effect; a single call is kept so the
# configuration that actually applies (INFO) is the only one on the page.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# Default effective genome sizes, in base pairs.
#
# Layout: canonical genome name -> {read length: effective genome size}.
# Every genome is tabulated at the same six read lengths (50-250); lookups
# for other read lengths are handled by ``getEffectiveGenomeSize``.
#
# Only genomes listed here have defaults: ``resolveGenomeName`` also accepts
# "ce10", which has no entry in this table.
# NOTE(review): the figures look like the deepTools effective-genome-size
# table -- confirm the provenance before editing any value.
EFFECTIVE_GENOME_SIZES = {
    # Human
    "hg19": {
        50: 2_685_511_454,
        75: 2_736_124_898,
        100: 2_776_919_708,
        150: 2_827_436_883,
        200: 2_855_463_800,
        250: 2_855_044_784,
    },
    "hg38": {
        50: 2_701_495_711,
        75: 2_747_877_702,
        100: 2_805_636_231,
        150: 2_862_010_428,
        200: 2_887_553_103,
        250: 2_898_802_627,
    },
    "t2t": {
        50: 2_725_240_337,
        75: 2_786_136_059,
        100: 2_814_334_875,
        150: 2_931_551_487,
        200: 2_936_403_235,
        250: 2_960_856_300,
    },
    # Mouse
    "mm10": {
        50: 2_308_125_299,
        75: 2_407_883_243,
        100: 2_467_481_008,
        150: 2_494_787_038,
        200: 2_520_868_989,
        250: 2_538_590_322,
    },
    "mm39": {
        50: 2_309_746_861,
        75: 2_410_055_689,
        100: 2_468_088_461,
        150: 2_495_461_690,
        200: 2_521_902_382,
        250: 2_538_633_971,
    },
    # Fruit fly
    "dm3": {
        50: 130_428_510,
        75: 135_004_387,
        100: 139_647_132,
        150: 144_307_658,
        200: 148_523_810,
        250: 151_901_455,
    },
    "dm6": {
        50: 125_464_678,
        75: 127_324_557,
        100: 129_789_773,
        150: 129_940_985,
        200: 132_508_963,
        250: 132_900_923,
    },
    # Worm
    "ce11": {
        50: 95_159_402,
        75: 96_945_370,
        100: 98_259_898,
        150: 98_721_103,
        200: 98_672_558,
        250: 101_271_756,
    },
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def resolveGenomeName(genome: str) -> str:
    r"""Map a genome name or assembly alias onto its canonical short name.

    Matching is case-insensitive: the input is lower-cased and compared
    against the known aliases of each supported assembly.

    :param genome: Name of the genome. See :class:`consenrich.core.genomeParams`.
    :type genome: str
    :return: Canonical genome name (e.g. ``"hg38"`` for ``"GRCh38"``).
    :rtype: str
    :raises ValueError: If the name matches no known alias.
    """
    # Canonical name -> every lower-case spelling accepted for it.
    aliasesByCanonicalName = {
        "hg19": ("hg19", "grch37"),
        "hg38": ("hg38", "grch38"),
        "t2t": ("t2t", "chm13", "t2t-chm13"),
        "mm10": ("mm10", "grcm38"),
        "mm39": ("mm39", "grcm39"),
        "dm3": ("dm3",),
        "dm6": ("dm6",),
        "ce10": ("ce10", "ws220"),
        "ce11": ("ce11", "wbcel235"),
    }

    lowered = genome.lower()
    for canonicalName, aliases in aliasesByCanonicalName.items():
        if lowered in aliases:
            return canonicalName

    # The message reports the name exactly as the caller supplied it.
    raise ValueError(
        f"Genome {genome} is not recognized. Please provide a valid genome name or manually specify resources"
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def getEffectiveGenomeSize(genome: str, readLength: int) -> int:
    r"""Get the effective genome size for a given genome and read length.

    When ``readLength`` has no exact entry for the genome, the size tabulated
    for the nearest available read length is returned instead of raising. If
    two tabulated read lengths are equally close, the one listed first in
    ``EFFECTIVE_GENOME_SIZES`` is used.

    :param genome: Name of the genome. See :func:`consenrich.constants.resolveGenomeName` and :class:`consenrich.core.genomeParams`.
    :type genome: str
    :param readLength: Length of the reads. See :func:`consenrich.core.getReadLength`.
    :type readLength: int
    :raises ValueError: If the genome is not recognized, or if it is recognized
        but has no default effective genome sizes.
    :return: Effective genome size in base pairs.
    :rtype: int
    """
    genome_: str = resolveGenomeName(genome)

    # The table is only read here, so no ``global`` declaration is needed.
    sizesByReadLength = EFFECTIVE_GENOME_SIZES.get(genome_)
    if sizesByReadLength is None:
        raise ValueError(f"Defaults not available for {genome}")

    if readLength in sizesByReadLength:
        return sizesByReadLength[readLength]

    # No exact match: fall back to the closest tabulated read length.
    # ``min`` keeps the first key on ties, i.e. table order decides.
    nearestReadLength = min(
        sizesByReadLength,
        key=lambda tabulated: abs(tabulated - readLength),
    )
    return sizesByReadLength[nearestReadLength]
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def getGenomeResourceFile(
    genome: str, fileType: str, dir_: str = "data"
) -> str:
    r"""Get the path to a genome resource file.

    The file name is built from ``genome`` exactly as given (no alias
    resolution is applied here) plus a suffix chosen by ``fileType``:
    ``<genome>.sizes``, ``<genome>_blacklist.bed`` or ``<genome>_sparse.bed``.

    :param genome: the genome assembly. See :func:`consenrich.constants.resolveGenomeName` and :class:`consenrich.core.genomeParams`.
    :type genome: str
    :param fileType: One of 'sizes', 'blacklist', 'sparse' (case-insensitive).
    :type fileType: str
    :param dir_: Directory holding the resource files, resolved relative to
        the directory of this module (an absolute path is used as-is).
    :type dir_: str
    :return: Path to the resource file.
    :rtype: str
    :raises ValueError: If not a sizes, blacklist, or sparse file.
    :raises FileNotFoundError: If the resource file does not exist.
    """
    suffixByFileType = {
        "sizes": ".sizes",
        "blacklist": "_blacklist.bed",
        "sparse": "_sparse.bed",
    }

    # Reject unknown file types explicitly; otherwise the file name would be
    # left unbound and surface as an UnboundLocalError instead of the
    # documented ValueError.
    suffix = suffixByFileType.get(fileType.lower())
    if suffix is None:
        raise ValueError(
            f"Unknown fileType '{fileType}'. Expected one of 'sizes', 'blacklist', 'sparse'."
        )

    filePath = os.path.join(
        os.path.dirname(__file__), dir_, f"{genome}{suffix}"
    )
    if not os.path.exists(filePath):
        raise FileNotFoundError(
            f"Resource file {filePath} does not exist."
        )
    return filePath
|