coolbox 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of coolbox might be problematic. Click here for more details.
- coolbox/__init__.py +1 -1
- coolbox/cli.py +0 -2
- coolbox/core/browser/base.py +5 -2
- coolbox/core/coverage/__init__.py +1 -1
- coolbox/core/coverage/highlights.py +4 -4
- coolbox/core/frame/frame.py +16 -6
- coolbox/core/track/__init__.py +2 -1
- coolbox/core/track/arcs/plot.py +6 -2
- coolbox/core/track/bed/__init__.py +0 -1
- coolbox/core/track/bed/base.py +93 -85
- coolbox/core/track/bed/bed.py +37 -16
- coolbox/core/track/bed/fetch.py +1 -1
- coolbox/core/track/bed/plot.py +71 -221
- coolbox/core/track/gtf.py +11 -9
- coolbox/core/track/hicmat/base.py +12 -9
- coolbox/core/track/hicmat/cool.py +6 -5
- coolbox/core/track/hicmat/dothic.py +4 -3
- coolbox/core/track/hicmat/hicmat.py +8 -9
- coolbox/core/track/hicmat/plot.py +12 -6
- coolbox/core/track/hist/__init__.py +10 -3
- coolbox/core/track/hist/bigwig.py +0 -16
- coolbox/core/track/hist/plot.py +13 -5
- coolbox/core/track/ideogram.py +19 -10
- coolbox/core/track/pseudo.py +6 -2
- coolbox/core/track/tad.py +237 -0
- coolbox/utilities/bed.py +1 -1
- coolbox/utilities/hic/straw.py +532 -329
- coolbox/utilities/hic/wrap.py +55 -24
- {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/METADATA +20 -11
- {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/RECORD +34 -34
- {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/WHEEL +1 -1
- coolbox/core/track/bed/tad.py +0 -18
- {coolbox-0.3.7.data → coolbox-0.3.9.data}/scripts/coolbox +0 -0
- {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/LICENSE +0 -0
- {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/top_level.txt +0 -0
coolbox/utilities/hic/straw.py
CHANGED
|
@@ -1,48 +1,36 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Module for load '.hic' data, from:
|
|
3
|
-
|
|
4
|
-
https://github.com/aidenlab/straw/blob/master/python/straw.py
|
|
5
|
-
|
|
6
|
-
------
|
|
7
|
-
|
|
8
2
|
Straw module
|
|
9
3
|
|
|
10
4
|
Straw enables programmatic access to .hic files.
|
|
11
5
|
.hic files store the contact matrices from Hi-C experiments and the
|
|
12
6
|
normalization and expected vectors, along with meta-data in the header.
|
|
13
7
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
It then reads the header, follows the various pointers to the desired matrix
|
|
19
|
-
and normalization vector, and stores as [x, y, count]
|
|
20
|
-
|
|
21
|
-
Usage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <\
|
|
22
|
-
BP/FRAG> <binsize>
|
|
8
|
+
Usage: strawObj = straw <hicFile(s)>
|
|
9
|
+
matrixObj = strawObj.getNormalizedMatrix <chr1> <chr2> <NONE/VC/VC_SQRT/KR> <BP/FRAG> <binsize>
|
|
10
|
+
data = matrixObj.getDataFromBinRegion <x1,x2,y1,y2>
|
|
23
11
|
|
|
24
12
|
Example:
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
13
|
+
import straw
|
|
14
|
+
strawObj = straw(filename)
|
|
15
|
+
matrixObj = strawObj.getNormalizedMatrix('5', '5', 'KR', 'BP', 5000)
|
|
16
|
+
result = matrixObj.getDataFromBinRegion(0,500,0,500)
|
|
17
|
+
for i in range(len(result[0])):
|
|
28
18
|
... print("{0}\t{1}\t{2}".format(result[0][i], result[1][i], result[2][i]))
|
|
29
19
|
|
|
30
20
|
See https://github.com/theaidenlab/straw/wiki/Python for more documentation
|
|
31
21
|
"""
|
|
32
|
-
|
|
33
22
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
34
23
|
|
|
35
|
-
__author__ = "Yue Wu
|
|
24
|
+
__author__ = "Yue Wu, Neva Durand, Yossi Eliaz, Muhammad Shamim, Erez Aiden"
|
|
36
25
|
__license__ = "MIT"
|
|
37
26
|
|
|
38
|
-
import sys
|
|
39
27
|
import struct
|
|
40
28
|
import zlib
|
|
29
|
+
import requests
|
|
41
30
|
import io
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
version = 0
|
|
31
|
+
import concurrent.futures
|
|
32
|
+
import math
|
|
33
|
+
import sys
|
|
46
34
|
|
|
47
35
|
|
|
48
36
|
def __readcstr(f):
|
|
@@ -62,69 +50,197 @@ def __readcstr(f):
|
|
|
62
50
|
readcstr = __readcstr
|
|
63
51
|
|
|
64
52
|
|
|
65
|
-
|
|
53
|
+
"""
|
|
54
|
+
functions for chrom.sizes
|
|
55
|
+
internal representation is a dictionary with
|
|
56
|
+
chromosome name as the key
|
|
57
|
+
value maps to a tuple containing the index and chromosome length
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ChromDotSizes:
|
|
62
|
+
def __init__(self, data):
|
|
63
|
+
self.data = data
|
|
64
|
+
|
|
65
|
+
def getLength(self, chrom):
|
|
66
|
+
try:
|
|
67
|
+
return int(self.data[chrom][1])
|
|
68
|
+
except:
|
|
69
|
+
print(str(chrom) + " not in chrom.sizes. Check that the chromosome name matches the genome.\n")
|
|
70
|
+
return None
|
|
71
|
+
|
|
72
|
+
def getIndex(self, chrom):
|
|
73
|
+
try:
|
|
74
|
+
return int(self.data[chrom][0])
|
|
75
|
+
except:
|
|
76
|
+
print(str(chrom) + " not in chrom.sizes. Check that the chromosome name matches the genome.\n")
|
|
77
|
+
return None
|
|
78
|
+
|
|
79
|
+
def figureOutEndpoints(self, chrAndPositions):
|
|
80
|
+
chrAndPositionsArray = chrAndPositions.split(":")
|
|
81
|
+
chrom = chrAndPositionsArray[0]
|
|
82
|
+
|
|
83
|
+
indx1 = 0
|
|
84
|
+
indx2 = self.getLength(chrom)
|
|
85
|
+
|
|
86
|
+
if len(chrAndPositionsArray) == 3:
|
|
87
|
+
indx1 = int(chrAndPositionsArray[1])
|
|
88
|
+
indx2 = int(chrAndPositionsArray[2])
|
|
89
|
+
|
|
90
|
+
return chrom, indx1, indx2
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def read_metadata(infile, verbose=False):
|
|
94
|
+
"""
|
|
95
|
+
Reads the metadata of HiC file from header.
|
|
96
|
+
|
|
97
|
+
Args
|
|
98
|
+
infile: str, path to the HiC file
|
|
99
|
+
verbose: bool
|
|
100
|
+
|
|
101
|
+
Returns
|
|
102
|
+
metadata: dict, containing the metadata.
|
|
103
|
+
Keys of the metadata:
|
|
104
|
+
HiC version,
|
|
105
|
+
Master index,
|
|
106
|
+
Genome ID (str),
|
|
107
|
+
Attribute dictionary (dict),
|
|
108
|
+
Chromosomes (dict),
|
|
109
|
+
Base pair-delimited resolutions (list),
|
|
110
|
+
Fragment-delimited resolutions (list).
|
|
111
|
+
"""
|
|
112
|
+
metadata = {}
|
|
113
|
+
import io
|
|
114
|
+
import struct
|
|
115
|
+
if (infile.startswith("http")):
|
|
116
|
+
# try URL first. 100K should be sufficient for header
|
|
117
|
+
headers = {'range': 'bytes=0-100000', 'x-amz-meta-requester': 'straw'}
|
|
118
|
+
s = requests.Session()
|
|
119
|
+
r = s.get(infile, headers=headers)
|
|
120
|
+
if (r.status_code >= 400):
|
|
121
|
+
print("Error accessing " + infile)
|
|
122
|
+
print("HTTP status code " + str(r.status_code))
|
|
123
|
+
sys.exit(1)
|
|
124
|
+
req = io.BytesIO(r.content)
|
|
125
|
+
myrange = r.headers['content-range'].split('/')
|
|
126
|
+
totalbytes = myrange[1]
|
|
127
|
+
else:
|
|
128
|
+
req = open(infile, 'rb')
|
|
129
|
+
magic_string = struct.unpack('<3s', req.read(3))[0]
|
|
130
|
+
req.read(1)
|
|
131
|
+
if (magic_string != b"HIC"):
|
|
132
|
+
sys.exit('This does not appear to be a HiC file magic string is incorrect')
|
|
133
|
+
version = struct.unpack('<i', req.read(4))[0]
|
|
134
|
+
metadata['HiC version'] = version
|
|
135
|
+
masterindex = struct.unpack('<q', req.read(8))[0]
|
|
136
|
+
metadata['Master index'] = masterindex
|
|
137
|
+
genome = ""
|
|
138
|
+
c = req.read(1).decode("utf-8")
|
|
139
|
+
while (c != '\0'):
|
|
140
|
+
genome += c
|
|
141
|
+
c = req.read(1).decode("utf-8")
|
|
142
|
+
metadata['Genome ID'] = genome
|
|
143
|
+
if (version > 8):
|
|
144
|
+
nvi = struct.unpack('<q', req.read(8))[0]
|
|
145
|
+
nvisize = struct.unpack('<q', req.read(8))[0]
|
|
146
|
+
metadata['NVI'] = nvi
|
|
147
|
+
metadata['NVI size'] = nvisize
|
|
148
|
+
## read and throw away attribute dictionary (stats+graphs)
|
|
149
|
+
nattributes = struct.unpack('<i', req.read(4))[0]
|
|
150
|
+
d = {}
|
|
151
|
+
for x in range(0, nattributes):
|
|
152
|
+
key = __readcstr(req)
|
|
153
|
+
value = __readcstr(req)
|
|
154
|
+
d[key] = value
|
|
155
|
+
metadata['Attribute dictionary'] = d
|
|
156
|
+
nChrs = struct.unpack('<i', req.read(4))[0]
|
|
157
|
+
d = {}
|
|
158
|
+
for x in range(0, nChrs):
|
|
159
|
+
key = __readcstr(req)
|
|
160
|
+
if (version > 8):
|
|
161
|
+
value = struct.unpack('q', req.read(8))[0]
|
|
162
|
+
else:
|
|
163
|
+
value = struct.unpack('<i', req.read(4))[0]
|
|
164
|
+
d[key] = value
|
|
165
|
+
metadata["Chromosomes"] = d
|
|
166
|
+
nBpRes = struct.unpack('<i', req.read(4))[0]
|
|
167
|
+
l = []
|
|
168
|
+
for x in range(0, nBpRes):
|
|
169
|
+
res = struct.unpack('<i', req.read(4))[0]
|
|
170
|
+
l.append(res)
|
|
171
|
+
metadata["Base pair-delimited resolutions"] = l
|
|
172
|
+
nFrag = struct.unpack('<i', req.read(4))[0]
|
|
173
|
+
l = []
|
|
174
|
+
for x in range(0, nFrag):
|
|
175
|
+
res = struct.unpack('<i', req.read(4))[0]
|
|
176
|
+
l.append(res)
|
|
177
|
+
metadata["Fragment-delimited resolutions"] = l
|
|
178
|
+
for k in metadata:
|
|
179
|
+
if k != 'Attribute dictionary':
|
|
180
|
+
print(k, ':', metadata[k])
|
|
181
|
+
if verbose:
|
|
182
|
+
print('Attribute dictionary', ':', metadata['Attribute dictionary'])
|
|
183
|
+
return metadata
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def readHeader(infile, is_synapse):
|
|
66
187
|
""" Reads the header
|
|
67
188
|
|
|
68
189
|
Args:
|
|
69
|
-
|
|
70
|
-
chr1 (str): Chromosome 1
|
|
71
|
-
chr2 (str): Chromosome 2
|
|
72
|
-
c1pos1 (int, optional): Starting range of chromosome1 output
|
|
73
|
-
c1pos2 (int, optional): Stopping range of chromosome1 output
|
|
74
|
-
c2pos1 (int, optional): Starting range of chromosome2 output
|
|
75
|
-
c2pos2 (int, optional): Stopping range of chromosome2 output
|
|
190
|
+
input file, is_synapse
|
|
76
191
|
|
|
77
192
|
Returns:
|
|
78
|
-
list: master index,
|
|
193
|
+
list: master index, version number, size of totalbytes, chromDotSizes
|
|
79
194
|
"""
|
|
195
|
+
|
|
196
|
+
if infile.startswith("http"):
|
|
197
|
+
# try URL first. 100K should be sufficient for header
|
|
198
|
+
headers = getHttpHeader('bytes=0-100000', is_synapse)
|
|
199
|
+
s = requests.Session()
|
|
200
|
+
r = s.get(infile, headers=headers)
|
|
201
|
+
if r.status_code >= 400:
|
|
202
|
+
print("Error accessing " + infile)
|
|
203
|
+
print("HTTP status code " + str(r.status_code))
|
|
204
|
+
return -1
|
|
205
|
+
req = io.BytesIO(r.content)
|
|
206
|
+
myrange = r.headers['content-range'].split('/')
|
|
207
|
+
totalbytes = myrange[1]
|
|
208
|
+
else:
|
|
209
|
+
req = open(infile, 'rb')
|
|
210
|
+
totalbytes = None
|
|
211
|
+
|
|
80
212
|
magic_string = struct.unpack('<3s', req.read(3))[0]
|
|
81
213
|
req.read(1)
|
|
82
|
-
if
|
|
214
|
+
if magic_string != b"HIC":
|
|
83
215
|
print('This does not appear to be a HiC file magic string is incorrect')
|
|
84
216
|
return -1
|
|
85
|
-
global version
|
|
86
217
|
version = struct.unpack('<i', req.read(4))[0]
|
|
87
|
-
if
|
|
218
|
+
if version < 6:
|
|
88
219
|
print("Version {0} no longer supported".format(str(version)))
|
|
89
220
|
return -1
|
|
90
|
-
#
|
|
221
|
+
#print('HiC version:' + ' {0}'.format(str(version)))
|
|
91
222
|
master = struct.unpack('<q', req.read(8))[0]
|
|
92
223
|
genome = b""
|
|
93
224
|
c = req.read(1)
|
|
94
|
-
while
|
|
225
|
+
while c != b'\0':
|
|
95
226
|
genome += c
|
|
96
227
|
c = req.read(1)
|
|
97
228
|
|
|
98
229
|
# read and throw away attribute dictionary (stats+graphs)
|
|
99
230
|
nattributes = struct.unpack('<i', req.read(4))[0]
|
|
100
|
-
for
|
|
231
|
+
for x in range(nattributes):
|
|
101
232
|
key = __readcstr(req)
|
|
102
233
|
value = __readcstr(req)
|
|
103
234
|
nChrs = struct.unpack('<i', req.read(4))[0]
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
for i in range(nChrs):
|
|
235
|
+
chromDotSizes = {}
|
|
236
|
+
for i in range(0, nChrs):
|
|
107
237
|
name = __readcstr(req)
|
|
108
238
|
length = struct.unpack('<i', req.read(4))[0]
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
chr1ind = i
|
|
112
|
-
if (posilist[0] == -100):
|
|
113
|
-
posilist[0] = 0
|
|
114
|
-
posilist[1] = length
|
|
115
|
-
if (name == chr2):
|
|
116
|
-
found2 = True
|
|
117
|
-
chr2ind = i
|
|
118
|
-
if (posilist[2] == -100):
|
|
119
|
-
posilist[2] = 0
|
|
120
|
-
posilist[3] = length
|
|
121
|
-
if ((not found1) or (not found2)):
|
|
122
|
-
print("One of the chromosomes wasn't found in the file. Check that the chromosome name matches the genome.\n")
|
|
123
|
-
return -1
|
|
124
|
-
return [master, chr1ind, chr2ind, posilist[0], posilist[1], posilist[2], posilist[3]]
|
|
239
|
+
chromDotSizes[name] = (i, length)
|
|
240
|
+
return master, version, totalbytes, ChromDotSizes(chromDotSizes)
|
|
125
241
|
|
|
126
242
|
|
|
127
|
-
def readFooter(
|
|
243
|
+
def readFooter(infile, is_synapse, master, totalbytes):
|
|
128
244
|
"""Reads the footer, which contains all the expected and normalization
|
|
129
245
|
vectors. Presumes file pointer is in correct position
|
|
130
246
|
Args:
|
|
@@ -140,73 +256,77 @@ def readFooter(req, c1, c2, norm, unit, resolution):
|
|
|
140
256
|
list: File position of matrix, position+size chr1 normalization vector,
|
|
141
257
|
position+size chr2 normalization vector
|
|
142
258
|
"""
|
|
143
|
-
|
|
144
|
-
|
|
259
|
+
if infile.startswith("http"):
|
|
260
|
+
headers = getHttpHeader('bytes={0}-{1}'.format(master, totalbytes), is_synapse)
|
|
261
|
+
s = requests.Session()
|
|
262
|
+
r = s.get(infile, headers=headers)
|
|
263
|
+
req = io.BytesIO(r.content)
|
|
264
|
+
else:
|
|
265
|
+
req = open(infile, 'rb')
|
|
266
|
+
req.seek(master)
|
|
267
|
+
|
|
268
|
+
filePositions = dict()
|
|
145
269
|
nBytes = struct.unpack('<i', req.read(4))[0]
|
|
146
|
-
key = str(c1) + "_" + str(c2)
|
|
147
270
|
nEntries = struct.unpack('<i', req.read(4))[0]
|
|
148
|
-
|
|
149
|
-
for
|
|
150
|
-
|
|
271
|
+
|
|
272
|
+
for i in range(nEntries):
|
|
273
|
+
key = __readcstr(req)
|
|
151
274
|
fpos = struct.unpack('<q', req.read(8))[0]
|
|
152
275
|
sizeinbytes = struct.unpack('<i', req.read(4))[0]
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
if (not found):
|
|
157
|
-
print("File doesn't have the given chr_chr map\n")
|
|
158
|
-
if (norm == "NONE"):
|
|
159
|
-
return [myFilePos, 0, 0]
|
|
276
|
+
filePositions[key] = (fpos, sizeinbytes)
|
|
277
|
+
|
|
278
|
+
# later save these
|
|
160
279
|
nExpectedValues = struct.unpack('<i', req.read(4))[0]
|
|
161
|
-
for
|
|
162
|
-
|
|
280
|
+
for i in range(nExpectedValues):
|
|
281
|
+
key = __readcstr(req)
|
|
163
282
|
binSize = struct.unpack('<i', req.read(4))[0]
|
|
164
283
|
nValues = struct.unpack('<i', req.read(4))[0]
|
|
165
|
-
for
|
|
284
|
+
for j in range(nValues):
|
|
285
|
+
# replace with vector.append
|
|
166
286
|
v = struct.unpack('<d', req.read(8))[0]
|
|
167
287
|
nNormalizationFactors = struct.unpack('<i', req.read(4))[0]
|
|
168
|
-
for
|
|
288
|
+
for j in range(nNormalizationFactors):
|
|
289
|
+
# replace with vector.append
|
|
169
290
|
chrIdx = struct.unpack('<i', req.read(4))[0]
|
|
170
291
|
v = struct.unpack('<d', req.read(8))[0]
|
|
171
292
|
nExpectedValues = struct.unpack('<i', req.read(4))[0]
|
|
172
|
-
for
|
|
293
|
+
for i in range(nExpectedValues):
|
|
173
294
|
str_ = __readcstr(req)
|
|
174
295
|
str_ = __readcstr(req)
|
|
175
296
|
binSize = struct.unpack('<i', req.read(4))[0]
|
|
176
297
|
nValues = struct.unpack('<i', req.read(4))[0]
|
|
177
|
-
for
|
|
298
|
+
for j in range(nValues):
|
|
178
299
|
v = struct.unpack('<d', req.read(8))[0]
|
|
179
300
|
nNormalizationFactors = struct.unpack('<i', req.read(4))[0]
|
|
180
|
-
for
|
|
301
|
+
for j in range(nNormalizationFactors):
|
|
181
302
|
chrIdx = struct.unpack('<i', req.read(4))[0]
|
|
182
303
|
v = struct.unpack('<d', req.read(8))[0]
|
|
304
|
+
|
|
305
|
+
normMap = dict()
|
|
183
306
|
nEntries = struct.unpack('<i', req.read(4))[0]
|
|
184
|
-
|
|
185
|
-
found2 = False
|
|
186
|
-
for _ in range(nEntries):
|
|
307
|
+
for i in range(nEntries):
|
|
187
308
|
normtype = __readcstr(req)
|
|
309
|
+
if normtype not in normMap:
|
|
310
|
+
normMap[normtype] = {}
|
|
188
311
|
chrIdx = struct.unpack('<i', req.read(4))[0]
|
|
189
|
-
|
|
190
|
-
|
|
312
|
+
if chrIdx not in normMap[normtype]:
|
|
313
|
+
normMap[normtype][chrIdx] = {}
|
|
314
|
+
unit = __readcstr(req)
|
|
315
|
+
if unit not in normMap[normtype][chrIdx]:
|
|
316
|
+
normMap[normtype][chrIdx][unit] = {}
|
|
317
|
+
resolution = struct.unpack('<i', req.read(4))[0]
|
|
318
|
+
if resolution not in normMap[normtype][chrIdx][unit]:
|
|
319
|
+
normMap[normtype][chrIdx][unit][resolution] = {}
|
|
191
320
|
filePosition = struct.unpack('<q', req.read(8))[0]
|
|
192
321
|
sizeInBytes = struct.unpack('<i', req.read(4))[0]
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
c2NormEntry['position'] = filePosition
|
|
199
|
-
c2NormEntry['size'] = sizeInBytes
|
|
200
|
-
found2 = True
|
|
201
|
-
if ((not found1) or (not found2)):
|
|
202
|
-
print("File did not contain {0} normalization vectors for one or both chromosomes at {1} {2}\n".format(norm,
|
|
203
|
-
resolution,
|
|
204
|
-
unit))
|
|
205
|
-
return -1
|
|
206
|
-
return [myFilePos, c1NormEntry, c2NormEntry]
|
|
322
|
+
|
|
323
|
+
normMap[normtype][chrIdx][unit][resolution]['position'] = filePosition
|
|
324
|
+
normMap[normtype][chrIdx][unit][resolution]['size'] = sizeInBytes
|
|
325
|
+
|
|
326
|
+
return filePositions, normMap
|
|
207
327
|
|
|
208
328
|
|
|
209
|
-
def readMatrixZoomData(req, myunit, mybinsize):
|
|
329
|
+
def readMatrixZoomData(req, myunit, mybinsize, blockMap):
|
|
210
330
|
""" Reads the Matrix Zoom Data, which gives pointer list for blocks for
|
|
211
331
|
the data. Presumes file pointer is in correct position
|
|
212
332
|
|
|
@@ -222,10 +342,10 @@ def readMatrixZoomData(req, myunit, mybinsize):
|
|
|
222
342
|
"""
|
|
223
343
|
unit = __readcstr(req)
|
|
224
344
|
temp = struct.unpack('<i', req.read(4))[0]
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
345
|
+
temp = struct.unpack('<f', req.read(4))[0]
|
|
346
|
+
temp = struct.unpack('<f', req.read(4))[0]
|
|
347
|
+
temp = struct.unpack('<f', req.read(4))[0]
|
|
348
|
+
temp = struct.unpack('<f', req.read(4))[0]
|
|
229
349
|
binSize = struct.unpack('<i', req.read(4))[0]
|
|
230
350
|
blockBinCount = struct.unpack('<i', req.read(4))[0]
|
|
231
351
|
blockColumnCount = struct.unpack('<i', req.read(4))[0]
|
|
@@ -233,22 +353,24 @@ def readMatrixZoomData(req, myunit, mybinsize):
|
|
|
233
353
|
# for the initial
|
|
234
354
|
myBlockBinCount = -1
|
|
235
355
|
myBlockColumnCount = -1
|
|
236
|
-
if
|
|
356
|
+
if myunit == unit and mybinsize == binSize:
|
|
237
357
|
myBlockBinCount = blockBinCount
|
|
238
358
|
myBlockColumnCount = blockColumnCount
|
|
239
359
|
storeBlockData = True
|
|
240
360
|
nBlocks = struct.unpack('<i', req.read(4))[0]
|
|
241
|
-
for
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
361
|
+
for b in range(nBlocks):
|
|
362
|
+
blockNumber = struct.unpack('<i', req.read(4))[0]
|
|
363
|
+
filePosition = struct.unpack('<q', req.read(8))[0]
|
|
364
|
+
blockSizeInBytes = struct.unpack('<i', req.read(4))[0]
|
|
365
|
+
entry = dict()
|
|
366
|
+
entry['size'] = blockSizeInBytes
|
|
367
|
+
entry['position'] = filePosition
|
|
368
|
+
if storeBlockData:
|
|
247
369
|
blockMap[blockNumber] = entry
|
|
248
|
-
return
|
|
370
|
+
return storeBlockData, myBlockBinCount, myBlockColumnCount
|
|
249
371
|
|
|
250
372
|
|
|
251
|
-
def readMatrix(req, unit, binsize):
|
|
373
|
+
def readMatrix(req, unit, binsize, blockMap):
|
|
252
374
|
""" Reads the matrix - that is, finds the appropriate pointers to block
|
|
253
375
|
data and stores them. Needs to read through headers of zoom data to find
|
|
254
376
|
appropriate matrix. Presumes file pointer is in correct position.
|
|
@@ -261,6 +383,9 @@ def readMatrix(req, unit, binsize):
|
|
|
261
383
|
|
|
262
384
|
Returns:
|
|
263
385
|
list containing block bin count and block column count of matrix
|
|
386
|
+
|
|
387
|
+
Raises:
|
|
388
|
+
ValueError if the .hic file can't be parsed with the specified resolution (binsize)
|
|
264
389
|
"""
|
|
265
390
|
c1 = struct.unpack('<i', req.read(4))[0]
|
|
266
391
|
c2 = struct.unpack('<i', req.read(4))[0]
|
|
@@ -269,17 +394,13 @@ def readMatrix(req, unit, binsize):
|
|
|
269
394
|
found = False
|
|
270
395
|
blockBinCount = -1
|
|
271
396
|
blockColumnCount = -1
|
|
272
|
-
while
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
if (not found):
|
|
280
|
-
print("Error finding block data\n")
|
|
281
|
-
return -1
|
|
282
|
-
return [blockBinCount, blockColumnCount]
|
|
397
|
+
while i < nRes and (not found):
|
|
398
|
+
found, blockBinCount, blockColumnCount = readMatrixZoomData(req, unit, binsize, blockMap)
|
|
399
|
+
i = i + 1
|
|
400
|
+
if not found:
|
|
401
|
+
raise ValueError(f"Error: could not parse .hic file using specified resolution/bin-size ({binsize})")
|
|
402
|
+
|
|
403
|
+
return blockBinCount, blockColumnCount
|
|
283
404
|
|
|
284
405
|
|
|
285
406
|
def getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, intra):
|
|
@@ -300,21 +421,21 @@ def getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockC
|
|
|
300
421
|
row1 = int(regionIndices[2] / blockBinCount)
|
|
301
422
|
row2 = int((regionIndices[3] + 1) / blockBinCount)
|
|
302
423
|
blocksSet = set()
|
|
303
|
-
|
|
424
|
+
|
|
304
425
|
for r in range(row1, row2 + 1):
|
|
305
426
|
for c in range(col1, col2 + 1):
|
|
306
427
|
blockNumber = r * blockColumnCount + c
|
|
307
428
|
blocksSet.add(blockNumber)
|
|
308
|
-
if
|
|
429
|
+
# in Java code, this is "if getBelowDiagonal"
|
|
430
|
+
if intra and col2 > row1:
|
|
309
431
|
for r in range(col1, col2 + 1):
|
|
310
432
|
for c in range(row1, row2 + 1):
|
|
311
433
|
blockNumber = r * blockColumnCount + c
|
|
312
434
|
blocksSet.add(blockNumber)
|
|
313
|
-
# print(str(blocksSet))
|
|
314
435
|
return blocksSet
|
|
315
436
|
|
|
316
437
|
|
|
317
|
-
def readBlock(req, size):
|
|
438
|
+
def readBlock(req, size, version):
|
|
318
439
|
""" Reads the block - reads the compressed bytes, decompresses, and stores
|
|
319
440
|
results in array. Presumes file pointer is in correct position.
|
|
320
441
|
|
|
@@ -330,13 +451,15 @@ def readBlock(req, size):
|
|
|
330
451
|
uncompressedBytes = zlib.decompress(compressedBytes)
|
|
331
452
|
nRecords = struct.unpack('<i', uncompressedBytes[0:4])[0]
|
|
332
453
|
v = []
|
|
333
|
-
|
|
334
|
-
if (version < 7):
|
|
454
|
+
if version < 7:
|
|
335
455
|
for i in range(nRecords):
|
|
336
456
|
binX = struct.unpack('<i', uncompressedBytes[(12 * i + 4):(12 * i + 8)])[0]
|
|
337
457
|
binY = struct.unpack('<i', uncompressedBytes[(12 * i + 8):(12 * i + 12)])[0]
|
|
338
458
|
counts = struct.unpack('<f', uncompressedBytes[(12 * i + 12):(12 * i + 16)])[0]
|
|
339
|
-
record =
|
|
459
|
+
record = dict()
|
|
460
|
+
record['binX'] = binX
|
|
461
|
+
record['binY'] = binY
|
|
462
|
+
record['counts'] = counts
|
|
340
463
|
v.append(record)
|
|
341
464
|
else:
|
|
342
465
|
binXOffset = struct.unpack('<i', uncompressedBytes[4:8])[0]
|
|
@@ -344,57 +467,125 @@ def readBlock(req, size):
|
|
|
344
467
|
useShort = struct.unpack('<b', uncompressedBytes[12:13])[0]
|
|
345
468
|
type_ = struct.unpack('<b', uncompressedBytes[13:14])[0]
|
|
346
469
|
index = 0
|
|
347
|
-
if
|
|
470
|
+
if type_ == 1:
|
|
348
471
|
rowCount = struct.unpack('<h', uncompressedBytes[14:16])[0]
|
|
349
472
|
temp = 16
|
|
350
|
-
for
|
|
473
|
+
for i in range(rowCount):
|
|
351
474
|
y = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
|
|
352
|
-
temp
|
|
475
|
+
temp = temp + 2
|
|
353
476
|
binY = y + binYOffset
|
|
354
477
|
colCount = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
|
|
355
|
-
temp
|
|
356
|
-
for
|
|
478
|
+
temp = temp + 2
|
|
479
|
+
for j in range(colCount):
|
|
357
480
|
x = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
|
|
358
|
-
temp
|
|
481
|
+
temp = temp + 2
|
|
359
482
|
binX = binXOffset + x
|
|
360
|
-
if
|
|
483
|
+
if useShort == 0:
|
|
361
484
|
c = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
|
|
362
|
-
temp
|
|
485
|
+
temp = temp + 2
|
|
363
486
|
counts = c
|
|
364
487
|
else:
|
|
365
488
|
counts = struct.unpack('<f', uncompressedBytes[temp:(temp + 4)])[0]
|
|
366
|
-
temp
|
|
367
|
-
record =
|
|
489
|
+
temp = temp + 4
|
|
490
|
+
record = dict()
|
|
491
|
+
record['binX'] = binX
|
|
492
|
+
record['binY'] = binY
|
|
493
|
+
record['counts'] = counts
|
|
368
494
|
v.append(record)
|
|
369
|
-
index
|
|
495
|
+
index = index + 1
|
|
370
496
|
elif type_ == 2:
|
|
371
497
|
temp = 14
|
|
372
498
|
nPts = struct.unpack('<i', uncompressedBytes[temp:(temp + 4)])[0]
|
|
373
|
-
temp
|
|
499
|
+
temp = temp + 4
|
|
374
500
|
w = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
|
|
375
|
-
temp
|
|
501
|
+
temp = temp + 2
|
|
376
502
|
for i in range(nPts):
|
|
377
503
|
row = int(i / w)
|
|
378
504
|
col = i - row * w
|
|
379
505
|
bin1 = int(binXOffset + col)
|
|
380
506
|
bin2 = int(binYOffset + row)
|
|
381
|
-
if
|
|
507
|
+
if useShort == 0:
|
|
382
508
|
c = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
|
|
383
|
-
temp
|
|
384
|
-
if
|
|
385
|
-
record =
|
|
509
|
+
temp = temp + 2
|
|
510
|
+
if c != -32768:
|
|
511
|
+
record = dict()
|
|
512
|
+
record['binX'] = bin1
|
|
513
|
+
record['binY'] = bin2
|
|
514
|
+
record['counts'] = c
|
|
386
515
|
v.append(record)
|
|
387
|
-
index
|
|
516
|
+
index = index + 1
|
|
388
517
|
else:
|
|
389
518
|
counts = struct.unpack('<f', uncompressedBytes[temp:(temp + 4)])[0]
|
|
390
|
-
temp
|
|
391
|
-
if
|
|
392
|
-
record =
|
|
519
|
+
temp = temp + 4
|
|
520
|
+
if counts != 0x7fc00000:
|
|
521
|
+
record = dict()
|
|
522
|
+
record['binX'] = bin1
|
|
523
|
+
record['binY'] = bin2
|
|
524
|
+
record['counts'] = counts
|
|
393
525
|
v.append(record)
|
|
394
|
-
index
|
|
526
|
+
index = index + 1
|
|
395
527
|
return v
|
|
396
528
|
|
|
397
529
|
|
|
530
|
+
def readBlockWorker(infile, is_synapse, blockNum, binsize, blockMap, norm, c1Norm, c2Norm, binPositionBox, isIntra,
|
|
531
|
+
version):
|
|
532
|
+
yActual = []
|
|
533
|
+
xActual = []
|
|
534
|
+
counts = []
|
|
535
|
+
idx = dict()
|
|
536
|
+
if blockNum in blockMap:
|
|
537
|
+
idx = blockMap[blockNum]
|
|
538
|
+
else:
|
|
539
|
+
idx['size'] = 0
|
|
540
|
+
idx['position'] = 0
|
|
541
|
+
|
|
542
|
+
if idx['size'] == 0:
|
|
543
|
+
records = []
|
|
544
|
+
else:
|
|
545
|
+
if infile.startswith("http"):
|
|
546
|
+
headers = getHttpHeader('bytes={0}-{1}'.format(idx['position'], idx['position'] + idx['size']), is_synapse)
|
|
547
|
+
s = requests.Session()
|
|
548
|
+
r = s.get(infile, headers=headers);
|
|
549
|
+
req = io.BytesIO(r.content)
|
|
550
|
+
else:
|
|
551
|
+
req = open(infile, 'rb')
|
|
552
|
+
req.seek(idx['position'])
|
|
553
|
+
records = readBlock(req, idx['size'], version)
|
|
554
|
+
|
|
555
|
+
# No caching currently; in Java code we keep all records and check positions later
|
|
556
|
+
if norm != "NONE":
|
|
557
|
+
for record in records:
|
|
558
|
+
binX = record['binX']
|
|
559
|
+
binY = record['binY']
|
|
560
|
+
|
|
561
|
+
if ((binPositionBox[0] <= binX <= binPositionBox[1] and binPositionBox[2] <= binY <=
|
|
562
|
+
binPositionBox[3]) or (
|
|
563
|
+
isIntra and binPositionBox[0] <= binY <= binPositionBox[1] and binPositionBox[2] <= binX <=
|
|
564
|
+
binPositionBox[3])):
|
|
565
|
+
c = record['counts']
|
|
566
|
+
a = c1Norm[binX] * c2Norm[binY]
|
|
567
|
+
if a != 0.0:
|
|
568
|
+
c = (c / a)
|
|
569
|
+
else:
|
|
570
|
+
c = "inf"
|
|
571
|
+
xActual.append(binX)
|
|
572
|
+
yActual.append(binY)
|
|
573
|
+
counts.append(c)
|
|
574
|
+
else:
|
|
575
|
+
for record in records:
|
|
576
|
+
binX = record['binX']
|
|
577
|
+
binY = record['binY']
|
|
578
|
+
if ((binPositionBox[0] <= binX <= binPositionBox[1] and binPositionBox[2] <= binY <=
|
|
579
|
+
binPositionBox[3]) or (
|
|
580
|
+
isIntra and binPositionBox[0] <= binY <= binPositionBox[1] and binPositionBox[2] <= binX <=
|
|
581
|
+
binPositionBox[3])):
|
|
582
|
+
c = record['counts']
|
|
583
|
+
xActual.append(binX)
|
|
584
|
+
yActual.append(binY)
|
|
585
|
+
counts.append(c)
|
|
586
|
+
return xActual, yActual, counts
|
|
587
|
+
|
|
588
|
+
|
|
398
589
|
def readNormalizationVector(req):
|
|
399
590
|
""" Reads the normalization vector from the file; presumes file pointer is
|
|
400
591
|
in correct position
|
|
@@ -409,184 +600,188 @@ def readNormalizationVector(req):
|
|
|
409
600
|
"""
|
|
410
601
|
value = []
|
|
411
602
|
nValues = struct.unpack('<i', req.read(4))[0]
|
|
412
|
-
for
|
|
603
|
+
for i in range(nValues):
|
|
413
604
|
d = struct.unpack('<d', req.read(8))[0]
|
|
414
605
|
value.append(d)
|
|
415
606
|
return value
|
|
416
607
|
|
|
417
608
|
|
|
418
|
-
def
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
list1 = readFooter(req, c1, c2, norm, unit, binsize)
|
|
519
|
-
myFilePos = list1[0]
|
|
520
|
-
c1NormEntry = list1[1]
|
|
521
|
-
c2NormEntry = list1[2]
|
|
522
|
-
|
|
523
|
-
if (norm != "NONE"):
|
|
524
|
-
if (infile.startswith("http")):
|
|
525
|
-
endrange = 'bytes={0}-{1}'.format(c1NormEntry['position'], c1NormEntry['position'] + c1NormEntry['size'])
|
|
526
|
-
headers = {'range': endrange, 'x-amz-meta-requester': 'straw'}
|
|
527
|
-
r = s.get(infile, headers=headers);
|
|
528
|
-
req = io.BytesIO(r.content);
|
|
529
|
-
c1Norm = readNormalizationVector(req)
|
|
530
|
-
|
|
531
|
-
endrange = 'bytes={0}-{1}'.format(c2NormEntry['position'], c2NormEntry['position'] + c2NormEntry['size'])
|
|
532
|
-
headers = {'range': endrange, 'x-amz-meta-requester': 'straw'}
|
|
533
|
-
r = s.get(infile, headers=headers)
|
|
534
|
-
req = io.BytesIO(r.content)
|
|
535
|
-
else:
|
|
536
|
-
req.seek(c1NormEntry['position'])
|
|
537
|
-
c1Norm = readNormalizationVector(req)
|
|
538
|
-
req.seek(c2NormEntry['position'])
|
|
539
|
-
c2Norm = readNormalizationVector(req)
|
|
540
|
-
if (infile.startswith("http")):
|
|
541
|
-
headers = {'range': 'bytes={0}-'.format(myFilePos), 'x-amz-meta-requester': 'straw'}
|
|
542
|
-
r = s.get(infile, headers=headers, stream=True)
|
|
543
|
-
list1 = readMatrix(r.raw, unit, binsize)
|
|
544
|
-
else:
|
|
545
|
-
req.seek(myFilePos)
|
|
546
|
-
list1 = readMatrix(req, unit, binsize)
|
|
547
|
-
|
|
548
|
-
blockBinCount = list1[0]
|
|
549
|
-
blockColumnCount = list1[1]
|
|
550
|
-
blockNumbers = getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, c1 == c2)
|
|
551
|
-
yActual = []
|
|
552
|
-
xActual = []
|
|
553
|
-
counts = []
|
|
554
|
-
|
|
555
|
-
for i_set in (blockNumbers):
|
|
556
|
-
idx = {}
|
|
557
|
-
if (i_set in blockMap):
|
|
558
|
-
idx = blockMap[i_set]
|
|
559
|
-
else:
|
|
560
|
-
idx['size'] = 0
|
|
561
|
-
idx['position'] = 0
|
|
562
|
-
if (idx['size'] == 0):
|
|
563
|
-
records = []
|
|
609
|
+
def getHttpHeader(endrange, is_synapse):
    """Build the HTTP headers for a ranged read of a remote .hic file.

    Args:
        endrange(str): Value for the 'range' header, e.g. 'bytes=0-100'.
        is_synapse(bool): True when the file is hosted on Synapse, which
            rejects the extra S3 requester metadata header.

    Returns:
        dict: Headers to pass to requests.get().
    """
    headers = {'range': endrange}
    if not is_synapse:
        # Plain S3-hosted files get the straw requester tag.
        headers['x-amz-meta-requester'] = 'straw'
    return headers
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def readLocalNorm(infile, position):
    """Read one normalization vector from a local .hic file.

    Args:
        infile(str): Path to the local .hic file.
        position(int): Byte offset of the normalization vector record.

    Returns:
        The parsed vector, as produced by readNormalizationVector().
    """
    # Use a context manager so the handle is closed even if parsing raises;
    # the previous version opened the file and never closed it.
    with open(infile, 'rb') as req:
        req.seek(position)
        return readNormalizationVector(req)
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def readHttpNorm(infile, normEntry, is_synapse):
    """Fetch and parse one normalization vector from a remote .hic file.

    Args:
        infile(str): URL of the .hic file.
        normEntry(dict): Footer entry with the vector's 'position' and 'size'.
        is_synapse(bool): True for Synapse-hosted files (alters HTTP headers,
            see getHttpHeader).

    Returns:
        The parsed vector, as produced by readNormalizationVector().
    """
    endrange = 'bytes={0}-{1}'.format(normEntry['position'], normEntry['position'] + normEntry['size'])
    headers = getHttpHeader(endrange, is_synapse)
    # Close the session (and its pooled connections) as soon as the response
    # body has been copied into memory; the previous version leaked it.
    with requests.Session() as s:
        r = s.get(infile, headers=headers)
        req = io.BytesIO(r.content)
    return readNormalizationVector(req)
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
class straw:
    """Reader for a .hic contact-matrix file, local or served over HTTP.

    The constructor parses the file header and footer; call
    getNormalizedMatrix() to obtain a `normalizedmatrix` object from which
    region data can be fetched in sparse upper-triangular format
    (row, column, normalized count).
    """

    def __init__(self, infile, is_synapse=False):
        """Open a .hic file and read its header/footer metadata.

        Args:
            infile(str): File name or URL of the .hic file. URLs are detected
                by a "http" prefix.
            is_synapse(bool): True when the file is hosted on Synapse
                (changes the HTTP headers used for ranged reads).
        """

        self.isHttpFile = infile.startswith("http")
        self.infile = infile
        self.is_synapse = is_synapse
        # readHeader/readFooter are module-level helpers: they yield the master
        # index offset, file version, total byte count and chromosome table,
        # then the per-chromosome-pair matrix positions and normalization map.
        self.master, self.version, totalbytes, self.chromDotSizes = readHeader(infile, is_synapse)
        self.myFilePositions, self.normMap = readFooter(infile, is_synapse, self.master, totalbytes)

    def getNormalizedMatrix(self, chr1, chr2, norm, unit, binsize):
        """Load the (optionally normalized) contact matrix for (chr1, chr2).

        Args:
            chr1(str): First chromosome name; must exist in the file.
            chr2(str): Second chromosome name; must exist in the file.
            norm(str): Normalization type, e.g. VC, KR, VC_SQRT, or NONE.
            unit(str): One of BP or FRAG.
            binsize(int): Resolution, i.e. 25000 for 25K.

        Returns:
            A `normalizedmatrix` instance, or None on any validation failure
            (errors are printed rather than raised).
        """

        if not (unit == "BP" or unit == "FRAG"):
            print(
                "Unit specified incorrectly, must be one of <BP/FRAG>\nUsage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>\n")
            return None

        for chrom in [chr1, chr2]:
            if chrom not in self.chromDotSizes.data:
                print(str(chrom) + " wasn't found in the file. Check that the chromosome name matches the genome.\n")
                return None

        chrIndex1 = self.chromDotSizes.getIndex(chr1)
        chrIndex2 = self.chromDotSizes.getIndex(chr2)
        isIntra = chrIndex1 == chrIndex2

        # Matrices are keyed with the smaller chromosome index first, so swap
        # the pair if needed and remember the flip so that query coordinates
        # can be swapped back later (see normalizedmatrix).
        neededToFlipIndices = False
        if chrIndex1 > chrIndex2:
            neededToFlipIndices = True
            chrIndex1, chrIndex2 = chrIndex2, chrIndex1
            chr1, chr2 = chr2, chr1

        # NOTE(review): the executor is never shut down explicitly; its worker
        # threads are reclaimed only when it is garbage collected.
        executor = concurrent.futures.ThreadPoolExecutor()
        if norm != "NONE":
            try:
                c1NormEntry = self.normMap[norm][chrIndex1][unit][binsize]
            except:  # a missing key at any nesting level means the vector is absent
                print(
                    "File did not contain {0} norm vectors for chr {1} at {2} {3}\n".format(norm, chr1, binsize, unit))
                return None

            if not isIntra:
                try:
                    c2NormEntry = self.normMap[norm][chrIndex2][unit][binsize]
                except:
                    print("File did not contain {0} norm vectors for chr {1} at {2} {3}\n".format(norm, chr2, binsize,
                                                                                                  unit))
                    return None
            # Kick off the normalization-vector reads so they overlap with the
            # matrix read below.
            if self.isHttpFile:
                futureNorm1 = executor.submit(readHttpNorm, self.infile, c1NormEntry, self.is_synapse)
                if not isIntra:
                    futureNorm2 = executor.submit(readHttpNorm, self.infile, c2NormEntry, self.is_synapse)
            else:
                futureNorm1 = executor.submit(readLocalNorm, self.infile, c1NormEntry['position'])
                if not isIntra:
                    futureNorm2 = executor.submit(readLocalNorm, self.infile, c2NormEntry['position'])

        # blockMap is passed to readMatrix, which presumably populates it with
        # per-block file positions for this chromosome pair -- confirm against
        # readMatrix; it is later handed to normalizedmatrix for region reads.
        blockMap = dict()
        key = str(chrIndex1) + "_" + str(chrIndex2)
        if key not in self.myFilePositions:
            print("File doesn't have the given {0} map\n".format(key))
            return None
        myFilePos = self.myFilePositions[key][0]
        if self.isHttpFile:
            # Open-ended range request: readMatrix consumes the stream from the
            # matrix record onward.
            headers = getHttpHeader('bytes={0}-'.format(myFilePos), self.is_synapse)
            s = requests.Session()
            r = s.get(self.infile, headers=headers, stream=True)
            futureMatrix = executor.submit(readMatrix, r.raw, unit, binsize, blockMap)
        else:
            # NOTE(review): this handle must stay open while readMatrix reads
            # it on the worker thread; it is never explicitly closed.
            req = open(self.infile, 'rb')
            req.seek(myFilePos)
            futureMatrix = executor.submit(readMatrix, req, unit, binsize, blockMap)

        if norm != "NONE":
            c1Norm = futureNorm1.result()
            if isIntra:
                # Intra-chromosomal: both axes share one normalization vector.
                c2Norm = c1Norm
            else:
                c2Norm = futureNorm2.result()
        else:
            c1Norm, c2Norm = None, None

        blockBinCount, blockColumnCount = futureMatrix.result()
        return normalizedmatrix(self.infile, self.is_synapse, binsize, isIntra, neededToFlipIndices, blockBinCount,
                                blockColumnCount, blockMap, norm, c1Norm, c2Norm, self.version)
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
class normalizedmatrix:
    """Sparse contact matrix for one chromosome pair, read lazily per region.

    Instances are produced by straw.getNormalizedMatrix(). Region queries
    dispatch one readBlockWorker task per block and concatenate the results
    into parallel x/y/count lists.
    """

    def __init__(self, infile, is_synapse, binsize, isIntra, neededToFlipIndices, blockBinCount, blockColumnCount,
                 blockMap, norm, c1Norm, c2Norm, version):
        # All arguments are computed by straw.getNormalizedMatrix and stored
        # unchanged for use by the region-query methods below.
        self.infile = infile
        self.is_synapse = is_synapse
        self.isHttpFile = infile.startswith("http")
        self.binsize = binsize
        self.isIntra = isIntra
        # True when straw swapped the chromosome order to match file layout;
        # queries must then swap their coordinates too.
        self.neededToFlipIndices = neededToFlipIndices
        self.blockBinCount = blockBinCount
        self.blockColumnCount = blockColumnCount
        self.norm = norm
        self.c1Norm = c1Norm
        self.c2Norm = c2Norm
        self.blockMap = blockMap
        self.version = version

    def getDataFromBinRegion(self, X1, X2, Y1, Y2):
        """Fetch contacts for a region given in BIN coordinates.

        Args:
            X1, X2: Bin range on the first axis (inclusive bounds, truncated
                to int).
            Y1, Y2: Bin range on the second axis.

        Returns:
            list: [xBins, yBins, counts] -- three parallel lists.
        """
        binsize = self.binsize
        if self.neededToFlipIndices:
            # Undo the chromosome swap performed by straw.getNormalizedMatrix.
            X1, X2, Y1, Y2 = Y1, Y2, X1, X2
        # Query box in the fixed order [x1, x2, y1, y2] expected downstream.
        binPositionsBox = []
        binPositionsBox.append(int(X1))
        binPositionsBox.append(int(X2))
        binPositionsBox.append(int(Y1))
        binPositionsBox.append(int(Y2))

        blockNumbers = getBlockNumbersForRegionFromBinPosition(binPositionsBox, self.blockBinCount,
                                                               self.blockColumnCount, self.isIntra)
        yActual = []
        xActual = []
        counts = []

        # NOTE(review): a fresh process pool is spawned on every call and never
        # shut down explicitly; all submitted arguments must be picklable.
        executor = concurrent.futures.ProcessPoolExecutor()
        futures = [
            executor.submit(readBlockWorker, self.infile, self.is_synapse, bNum, binsize, self.blockMap, self.norm, \
                            self.c1Norm, self.c2Norm, binPositionsBox, self.isIntra, self.version) for bNum in
            blockNumbers]

        # Concatenate per-block results in block order.
        for future in futures:
            xTemp, yTemp, cTemp = future.result()
            xActual.extend(xTemp)
            yActual.extend(yTemp)
            counts.extend(cTemp)
        return [xActual, yActual, counts]

    def getDataFromGenomeRegion(self, X1, X2, Y1, Y2):
        """Fetch contacts for a region given in GENOMIC (bp) coordinates.

        Coordinates are converted to bin units (floor for starts, ceil for
        ends) and delegated to getDataFromBinRegion.
        """
        binsize = self.binsize
        return self.getDataFromBinRegion(X1 / binsize, math.ceil(X2 / binsize), Y1 / binsize, math.ceil(Y2 / binsize))

    def getBatchedDataFromGenomeRegion(self, listOfCoordinates):
        """Fetch several genomic regions concurrently.

        Args:
            listOfCoordinates: Iterable of (X1, X2, Y1, Y2) tuples in bp.

        Returns:
            list: One getDataFromGenomeRegion() result per input tuple, in
            input order.
        """
        executor = concurrent.futures.ThreadPoolExecutor()
        futures = [executor.submit(self.getDataFromGenomeRegion, a, b, c, d) for (a, b, c, d) in listOfCoordinates]
        finalResults = list()
        for future in futures:
            finalResults.append(future.result())
        return finalResults
|
|
590
785
|
|
|
591
786
|
|
|
592
787
|
def printme(norm, infile, chr1loc, chr2loc, unit, binsize, outfile):
|
|
@@ -602,8 +797,16 @@ def printme(norm, infile, chr1loc, chr2loc, unit, binsize, outfile):
|
|
|
602
797
|
binsize(int): Resolution, i.e. 25000 for 25K
|
|
603
798
|
outfile(str): Name of text file to write to
|
|
604
799
|
"""
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
800
|
+
f = open(outfile, 'w')
|
|
801
|
+
strawObj = straw(infile)
|
|
802
|
+
|
|
803
|
+
chr1, X1, X2 = strawObj.chromDotSizes.figureOutEndpoints(chr1loc)
|
|
804
|
+
chr2, Y1, Y2 = strawObj.chromDotSizes.figureOutEndpoints(chr2loc)
|
|
805
|
+
|
|
806
|
+
matrxObj = strawObj.getNormalizedMatrix(chr1, chr2, norm, unit, binsize)
|
|
807
|
+
|
|
808
|
+
result = matrxObj.getDataFromGenomeRegion(X1, X2, Y1, Y2)
|
|
809
|
+
|
|
810
|
+
for i in range(len(result[0])):
|
|
811
|
+
f.write("{0}\t{1}\t{2}\n".format(result[0][i], result[1][i], result[2][i]))
|
|
812
|
+
f.close()
|