coolbox 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of coolbox might be problematic.

Files changed (35)
  1. coolbox/__init__.py +1 -1
  2. coolbox/cli.py +0 -2
  3. coolbox/core/browser/base.py +5 -2
  4. coolbox/core/coverage/__init__.py +1 -1
  5. coolbox/core/coverage/highlights.py +4 -4
  6. coolbox/core/frame/frame.py +16 -6
  7. coolbox/core/track/__init__.py +2 -1
  8. coolbox/core/track/arcs/plot.py +6 -2
  9. coolbox/core/track/bed/__init__.py +0 -1
  10. coolbox/core/track/bed/base.py +93 -85
  11. coolbox/core/track/bed/bed.py +37 -16
  12. coolbox/core/track/bed/fetch.py +1 -1
  13. coolbox/core/track/bed/plot.py +71 -221
  14. coolbox/core/track/gtf.py +11 -9
  15. coolbox/core/track/hicmat/base.py +12 -9
  16. coolbox/core/track/hicmat/cool.py +6 -5
  17. coolbox/core/track/hicmat/dothic.py +4 -3
  18. coolbox/core/track/hicmat/hicmat.py +8 -9
  19. coolbox/core/track/hicmat/plot.py +12 -6
  20. coolbox/core/track/hist/__init__.py +10 -3
  21. coolbox/core/track/hist/bigwig.py +0 -16
  22. coolbox/core/track/hist/plot.py +13 -5
  23. coolbox/core/track/ideogram.py +19 -10
  24. coolbox/core/track/pseudo.py +6 -2
  25. coolbox/core/track/tad.py +237 -0
  26. coolbox/utilities/bed.py +1 -1
  27. coolbox/utilities/hic/straw.py +532 -329
  28. coolbox/utilities/hic/wrap.py +55 -24
  29. {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/METADATA +20 -11
  30. {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/RECORD +34 -34
  31. {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/WHEEL +1 -1
  32. coolbox/core/track/bed/tad.py +0 -18
  33. {coolbox-0.3.7.data → coolbox-0.3.9.data}/scripts/coolbox +0 -0
  34. {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/LICENSE +0 -0
  35. {coolbox-0.3.7.dist-info → coolbox-0.3.9.dist-info}/top_level.txt +0 -0
@@ -1,48 +1,36 @@
  """
- Module for load '.hic' data, from:
-
- https://github.com/aidenlab/straw/blob/master/python/straw.py
-
- ------
-
  Straw module

  Straw enables programmatic access to .hic files.
  .hic files store the contact matrices from Hi-C experiments and the
  normalization and expected vectors, along with meta-data in the header.

- The main function, straw, takes in the normalization, the filename or URL,
- chromosome1 (and optional range), chromosome2 (and optional range),
- whether the bins desired are fragment or base pair delimited, and bin size.
-
- It then reads the header, follows the various pointers to the desired matrix
- and normalization vector, and stores as [x, y, count]
-
- Usage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <\
- BP/FRAG> <binsize>
+ Usage: strawObj = straw <hicFile(s)>
+ matrixObj = strawObj.getNormalizedMatrix <chr1> <chr2> <NONE/VC/VC_SQRT/KR> <BP/FRAG> <binsize>
+ data = matrixObj.getDataFromBinRegion <x1,x2,y1,y2>

  Example:
- >>>import straw
- >>>result = straw.straw('NONE', 'HIC001.hic', 'X', 'X', 'BP', 1000000)
- >>>for i in range(len(result[0])):
+ import straw
+ strawObj = straw(filename)
+ matrixObj = strawObj.getNormalizedMatrix('5', '5', 'KR', 'BP', 5000)
+ result = matrixObj.getDataFromBinRegion(0,500,0,500)
+ for i in range(len(result[0])):
  ... print("{0}\t{1}\t{2}".format(result[0][i], result[1][i], result[2][i]))

  See https://github.com/theaidenlab/straw/wiki/Python for more documentation
  """
-
  from __future__ import absolute_import, division, print_function, unicode_literals

- __author__ = "Yue Wu and Neva Durand"
+ __author__ = "Yue Wu, Neva Durand, Yossi Eliaz, Muhammad Shamim, Erez Aiden"
  __license__ = "MIT"

- import sys
  import struct
  import zlib
+ import requests
  import io
-
- blockMap = {}
- # global version
- version = 0
+ import concurrent.futures
+ import math
+ import sys


  def __readcstr(f):
@@ -62,69 +50,197 @@ def __readcstr(f):
  readcstr = __readcstr


- def readHeader(req, chr1, chr2, posilist):
+ """
+ functions for chrom.sizes
+ internal representation is a dictionary with
+ chromosome name as the key
+ value maps to a tuple containing the index and chromosome length
+ """
+
+
+ class ChromDotSizes:
+ def __init__(self, data):
+ self.data = data
+
+ def getLength(self, chrom):
+ try:
+ return int(self.data[chrom][1])
+ except:
+ print(str(chrom) + " not in chrom.sizes. Check that the chromosome name matches the genome.\n")
+ return None
+
+ def getIndex(self, chrom):
+ try:
+ return int(self.data[chrom][0])
+ except:
+ print(str(chrom) + " not in chrom.sizes. Check that the chromosome name matches the genome.\n")
+ return None
+
+ def figureOutEndpoints(self, chrAndPositions):
+ chrAndPositionsArray = chrAndPositions.split(":")
+ chrom = chrAndPositionsArray[0]
+
+ indx1 = 0
+ indx2 = self.getLength(chrom)
+
+ if len(chrAndPositionsArray) == 3:
+ indx1 = int(chrAndPositionsArray[1])
+ indx2 = int(chrAndPositionsArray[2])
+
+ return chrom, indx1, indx2
+
+
+ def read_metadata(infile, verbose=False):
+ """
+ Reads the metadata of HiC file from header.
+
+ Args
+ infile: str, path to the HiC file
+ verbose: bool
+
+ Returns
+ metadata: dict, containing the metadata.
+ Keys of the metadata:
+ HiC version,
+ Master index,
+ Genome ID (str),
+ Attribute dictionary (dict),
+ Chromosomes (dict),
+ Base pair-delimited resolutions (list),
+ Fragment-delimited resolutions (list).
+ """
+ metadata = {}
+ import io
+ import struct
+ if (infile.startswith("http")):
+ # try URL first. 100K should be sufficient for header
+ headers = {'range': 'bytes=0-100000', 'x-amz-meta-requester': 'straw'}
+ s = requests.Session()
+ r = s.get(infile, headers=headers)
+ if (r.status_code >= 400):
+ print("Error accessing " + infile)
+ print("HTTP status code " + str(r.status_code))
+ sys.exit(1)
+ req = io.BytesIO(r.content)
+ myrange = r.headers['content-range'].split('/')
+ totalbytes = myrange[1]
+ else:
+ req = open(infile, 'rb')
+ magic_string = struct.unpack('<3s', req.read(3))[0]
+ req.read(1)
+ if (magic_string != b"HIC"):
+ sys.exit('This does not appear to be a HiC file magic string is incorrect')
+ version = struct.unpack('<i', req.read(4))[0]
+ metadata['HiC version'] = version
+ masterindex = struct.unpack('<q', req.read(8))[0]
+ metadata['Master index'] = masterindex
+ genome = ""
+ c = req.read(1).decode("utf-8")
+ while (c != '\0'):
+ genome += c
+ c = req.read(1).decode("utf-8")
+ metadata['Genome ID'] = genome
+ if (version > 8):
+ nvi = struct.unpack('<q', req.read(8))[0]
+ nvisize = struct.unpack('<q', req.read(8))[0]
+ metadata['NVI'] = nvi
+ metadata['NVI size'] = nvisize
+ ## read and throw away attribute dictionary (stats+graphs)
+ nattributes = struct.unpack('<i', req.read(4))[0]
+ d = {}
+ for x in range(0, nattributes):
+ key = __readcstr(req)
+ value = __readcstr(req)
+ d[key] = value
+ metadata['Attribute dictionary'] = d
+ nChrs = struct.unpack('<i', req.read(4))[0]
+ d = {}
+ for x in range(0, nChrs):
+ key = __readcstr(req)
+ if (version > 8):
+ value = struct.unpack('q', req.read(8))[0]
+ else:
+ value = struct.unpack('<i', req.read(4))[0]
+ d[key] = value
+ metadata["Chromosomes"] = d
+ nBpRes = struct.unpack('<i', req.read(4))[0]
+ l = []
+ for x in range(0, nBpRes):
+ res = struct.unpack('<i', req.read(4))[0]
+ l.append(res)
+ metadata["Base pair-delimited resolutions"] = l
+ nFrag = struct.unpack('<i', req.read(4))[0]
+ l = []
+ for x in range(0, nFrag):
+ res = struct.unpack('<i', req.read(4))[0]
+ l.append(res)
+ metadata["Fragment-delimited resolutions"] = l
+ for k in metadata:
+ if k != 'Attribute dictionary':
+ print(k, ':', metadata[k])
+ if verbose:
+ print('Attribute dictionary', ':', metadata['Attribute dictionary'])
+ return metadata
+
+
+ def readHeader(infile, is_synapse):
  """ Reads the header

  Args:
- req (file): File to read from
- chr1 (str): Chromosome 1
- chr2 (str): Chromosome 2
- c1pos1 (int, optional): Starting range of chromosome1 output
- c1pos2 (int, optional): Stopping range of chromosome1 output
- c2pos1 (int, optional): Starting range of chromosome2 output
- c2pos2 (int, optional): Stopping range of chromosome2 output
+ input file, is_synapse

  Returns:
- list: master index, chromosome1 index, chromosome2 index
+ list: master index, version number, size of totalbytes, chromDotSizes
  """
+
+ if infile.startswith("http"):
+ # try URL first. 100K should be sufficient for header
+ headers = getHttpHeader('bytes=0-100000', is_synapse)
+ s = requests.Session()
+ r = s.get(infile, headers=headers)
+ if r.status_code >= 400:
+ print("Error accessing " + infile)
+ print("HTTP status code " + str(r.status_code))
+ return -1
+ req = io.BytesIO(r.content)
+ myrange = r.headers['content-range'].split('/')
+ totalbytes = myrange[1]
+ else:
+ req = open(infile, 'rb')
+ totalbytes = None
+
  magic_string = struct.unpack('<3s', req.read(3))[0]
  req.read(1)
- if (magic_string != b"HIC"):
+ if magic_string != b"HIC":
  print('This does not appear to be a HiC file magic string is incorrect')
  return -1
- global version
  version = struct.unpack('<i', req.read(4))[0]
- if (version < 6):
+ if version < 6:
  print("Version {0} no longer supported".format(str(version)))
  return -1
- # print('HiC version:' + ' {0}'.format(str(version)))
+ #print('HiC version:' + ' {0}'.format(str(version)))
  master = struct.unpack('<q', req.read(8))[0]
  genome = b""
  c = req.read(1)
- while (c != b'\0'):
+ while c != b'\0':
  genome += c
  c = req.read(1)

  # read and throw away attribute dictionary (stats+graphs)
  nattributes = struct.unpack('<i', req.read(4))[0]
- for _ in range(nattributes):
+ for x in range(nattributes):
  key = __readcstr(req)
  value = __readcstr(req)
  nChrs = struct.unpack('<i', req.read(4))[0]
- found1 = False
- found2 = False
- for i in range(nChrs):
+ chromDotSizes = {}
+ for i in range(0, nChrs):
  name = __readcstr(req)
  length = struct.unpack('<i', req.read(4))[0]
- if (name == chr1):
- found1 = True
- chr1ind = i
- if (posilist[0] == -100):
- posilist[0] = 0
- posilist[1] = length
- if (name == chr2):
- found2 = True
- chr2ind = i
- if (posilist[2] == -100):
- posilist[2] = 0
- posilist[3] = length
- if ((not found1) or (not found2)):
- print("One of the chromosomes wasn't found in the file. Check that the chromosome name matches the genome.\n")
- return -1
- return [master, chr1ind, chr2ind, posilist[0], posilist[1], posilist[2], posilist[3]]
+ chromDotSizes[name] = (i, length)
+ return master, version, totalbytes, ChromDotSizes(chromDotSizes)


- def readFooter(req, c1, c2, norm, unit, resolution):
+ def readFooter(infile, is_synapse, master, totalbytes):
  """Reads the footer, which contains all the expected and normalization
  vectors. Presumes file pointer is in correct position
  Args:
@@ -140,73 +256,77 @@ def readFooter(req, c1, c2, norm, unit, resolution):
  list: File position of matrix, position+size chr1 normalization vector,
  position+size chr2 normalization vector
  """
- c1NormEntry = {}
- c2NormEntry = {}
+ if infile.startswith("http"):
+ headers = getHttpHeader('bytes={0}-{1}'.format(master, totalbytes), is_synapse)
+ s = requests.Session()
+ r = s.get(infile, headers=headers)
+ req = io.BytesIO(r.content)
+ else:
+ req = open(infile, 'rb')
+ req.seek(master)
+
+ filePositions = dict()
  nBytes = struct.unpack('<i', req.read(4))[0]
- key = str(c1) + "_" + str(c2)
  nEntries = struct.unpack('<i', req.read(4))[0]
- found = False
- for _ in range(nEntries):
- stri = __readcstr(req)
+
+ for i in range(nEntries):
+ key = __readcstr(req)
  fpos = struct.unpack('<q', req.read(8))[0]
  sizeinbytes = struct.unpack('<i', req.read(4))[0]
- if (stri == key):
- myFilePos = fpos
- found = True
- if (not found):
- print("File doesn't have the given chr_chr map\n")
- if (norm == "NONE"):
- return [myFilePos, 0, 0]
+ filePositions[key] = (fpos, sizeinbytes)
+
+ # later save these
  nExpectedValues = struct.unpack('<i', req.read(4))[0]
- for _ in range(nExpectedValues):
- str_ = __readcstr(req)
+ for i in range(nExpectedValues):
+ key = __readcstr(req)
  binSize = struct.unpack('<i', req.read(4))[0]
  nValues = struct.unpack('<i', req.read(4))[0]
- for _ in range(nValues):
+ for j in range(nValues):
+ # replace with vector.append
  v = struct.unpack('<d', req.read(8))[0]
  nNormalizationFactors = struct.unpack('<i', req.read(4))[0]
- for _ in range(nNormalizationFactors):
+ for j in range(nNormalizationFactors):
+ # replace with vector.append
  chrIdx = struct.unpack('<i', req.read(4))[0]
  v = struct.unpack('<d', req.read(8))[0]
  nExpectedValues = struct.unpack('<i', req.read(4))[0]
- for _ in range(nExpectedValues):
+ for i in range(nExpectedValues):
  str_ = __readcstr(req)
  str_ = __readcstr(req)
  binSize = struct.unpack('<i', req.read(4))[0]
  nValues = struct.unpack('<i', req.read(4))[0]
- for _ in range(nValues):
+ for j in range(nValues):
  v = struct.unpack('<d', req.read(8))[0]
  nNormalizationFactors = struct.unpack('<i', req.read(4))[0]
- for _ in range(nNormalizationFactors):
+ for j in range(nNormalizationFactors):
  chrIdx = struct.unpack('<i', req.read(4))[0]
  v = struct.unpack('<d', req.read(8))[0]
+
+ normMap = dict()
  nEntries = struct.unpack('<i', req.read(4))[0]
- found1 = False
- found2 = False
- for _ in range(nEntries):
+ for i in range(nEntries):
  normtype = __readcstr(req)
+ if normtype not in normMap:
+ normMap[normtype] = {}
  chrIdx = struct.unpack('<i', req.read(4))[0]
- unit1 = __readcstr(req)
- resolution1 = struct.unpack('<i', req.read(4))[0]
+ if chrIdx not in normMap[normtype]:
+ normMap[normtype][chrIdx] = {}
+ unit = __readcstr(req)
+ if unit not in normMap[normtype][chrIdx]:
+ normMap[normtype][chrIdx][unit] = {}
+ resolution = struct.unpack('<i', req.read(4))[0]
+ if resolution not in normMap[normtype][chrIdx][unit]:
+ normMap[normtype][chrIdx][unit][resolution] = {}
  filePosition = struct.unpack('<q', req.read(8))[0]
  sizeInBytes = struct.unpack('<i', req.read(4))[0]
- if (chrIdx == c1 and normtype == norm and unit1 == unit and resolution1 == resolution):
- c1NormEntry['position'] = filePosition
- c1NormEntry['size'] = sizeInBytes
- found1 = True
- if (chrIdx == c2 and normtype == norm and unit1 == unit and resolution1 == resolution):
- c2NormEntry['position'] = filePosition
- c2NormEntry['size'] = sizeInBytes
- found2 = True
- if ((not found1) or (not found2)):
- print("File did not contain {0} normalization vectors for one or both chromosomes at {1} {2}\n".format(norm,
- resolution,
- unit))
- return -1
- return [myFilePos, c1NormEntry, c2NormEntry]
+
+ normMap[normtype][chrIdx][unit][resolution]['position'] = filePosition
+ normMap[normtype][chrIdx][unit][resolution]['size'] = sizeInBytes
+
+ return filePositions, normMap


- def readMatrixZoomData(req, myunit, mybinsize):
+ def readMatrixZoomData(req, myunit, mybinsize, blockMap):
  """ Reads the Matrix Zoom Data, which gives pointer list for blocks for
  the data. Presumes file pointer is in correct position

@@ -222,10 +342,10 @@ def readMatrixZoomData(req, myunit, mybinsize):
  """
  unit = __readcstr(req)
  temp = struct.unpack('<i', req.read(4))[0]
- temp2 = struct.unpack('<f', req.read(4))[0]
- temp2 = struct.unpack('<f', req.read(4))[0]
- temp2 = struct.unpack('<f', req.read(4))[0]
- temp2 = struct.unpack('<f', req.read(4))[0]
+ temp = struct.unpack('<f', req.read(4))[0]
+ temp = struct.unpack('<f', req.read(4))[0]
+ temp = struct.unpack('<f', req.read(4))[0]
+ temp = struct.unpack('<f', req.read(4))[0]
  binSize = struct.unpack('<i', req.read(4))[0]
  blockBinCount = struct.unpack('<i', req.read(4))[0]
  blockColumnCount = struct.unpack('<i', req.read(4))[0]
@@ -233,22 +353,24 @@ def readMatrixZoomData(req, myunit, mybinsize):
  # for the initial
  myBlockBinCount = -1
  myBlockColumnCount = -1
- if (myunit == unit and mybinsize == binSize):
+ if myunit == unit and mybinsize == binSize:
  myBlockBinCount = blockBinCount
  myBlockColumnCount = blockColumnCount
  storeBlockData = True
  nBlocks = struct.unpack('<i', req.read(4))[0]
- for _ in range(nBlocks):
- if (storeBlockData):
- blockNumber = struct.unpack('<i', req.read(4))[0]
- filePosition = struct.unpack('<q', req.read(8))[0]
- blockSizeInBytes = struct.unpack('<i', req.read(4))[0]
- entry = {'size': blockSizeInBytes, 'position': filePosition}
+ for b in range(nBlocks):
+ blockNumber = struct.unpack('<i', req.read(4))[0]
+ filePosition = struct.unpack('<q', req.read(8))[0]
+ blockSizeInBytes = struct.unpack('<i', req.read(4))[0]
+ entry = dict()
+ entry['size'] = blockSizeInBytes
+ entry['position'] = filePosition
+ if storeBlockData:
  blockMap[blockNumber] = entry
- return [storeBlockData, myBlockBinCount, myBlockColumnCount]
+ return storeBlockData, myBlockBinCount, myBlockColumnCount


- def readMatrix(req, unit, binsize):
+ def readMatrix(req, unit, binsize, blockMap):
  """ Reads the matrix - that is, finds the appropriate pointers to block
  data and stores them. Needs to read through headers of zoom data to find
  appropriate matrix. Presumes file pointer is in correct position.
@@ -261,6 +383,9 @@ def readMatrix(req, unit, binsize):

  Returns:
  list containing block bin count and block column count of matrix
+
+ Raises:
+ ValueError if the .hic file can't be parsed with the specified resolution (binsize)
  """
  c1 = struct.unpack('<i', req.read(4))[0]
  c2 = struct.unpack('<i', req.read(4))[0]
@@ -269,17 +394,13 @@ def readMatrix(req, unit, binsize):
  found = False
  blockBinCount = -1
  blockColumnCount = -1
- while (i < nRes and (not found)):
- list1 = readMatrixZoomData(req, unit, binsize)
- found = list1[0]
- if (list1[1] != -1 and list1[2] != -1):
- blockBinCount = list1[1]
- blockColumnCount = list1[2]
- i += 1
- if (not found):
- print("Error finding block data\n")
- return -1
- return [blockBinCount, blockColumnCount]
+ while i < nRes and (not found):
+ found, blockBinCount, blockColumnCount = readMatrixZoomData(req, unit, binsize, blockMap)
+ i = i + 1
+ if not found:
+ raise ValueError(f"Error: could not parse .hic file using specified resolution/bin-size ({binsize})")
+
+ return blockBinCount, blockColumnCount


  def getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, intra):
@@ -300,21 +421,21 @@ def getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockC
  row1 = int(regionIndices[2] / blockBinCount)
  row2 = int((regionIndices[3] + 1) / blockBinCount)
  blocksSet = set()
- # print(str(col1)+"\t"+str(col2)+"\t"+str(row1)+"\t"+str(row2))
+
  for r in range(row1, row2 + 1):
  for c in range(col1, col2 + 1):
  blockNumber = r * blockColumnCount + c
  blocksSet.add(blockNumber)
- if (intra):
+ # in Java code, this is "if getBelowDiagonal"
+ if intra and col2 > row1:
  for r in range(col1, col2 + 1):
  for c in range(row1, row2 + 1):
  blockNumber = r * blockColumnCount + c
  blocksSet.add(blockNumber)
- # print(str(blocksSet))
  return blocksSet


- def readBlock(req, size):
+ def readBlock(req, size, version):
  """ Reads the block - reads the compressed bytes, decompresses, and stores
  results in array. Presumes file pointer is in correct position.

@@ -330,13 +451,15 @@ def readBlock(req, size):
  uncompressedBytes = zlib.decompress(compressedBytes)
  nRecords = struct.unpack('<i', uncompressedBytes[0:4])[0]
  v = []
- global version
- if (version < 7):
+ if version < 7:
  for i in range(nRecords):
  binX = struct.unpack('<i', uncompressedBytes[(12 * i + 4):(12 * i + 8)])[0]
  binY = struct.unpack('<i', uncompressedBytes[(12 * i + 8):(12 * i + 12)])[0]
  counts = struct.unpack('<f', uncompressedBytes[(12 * i + 12):(12 * i + 16)])[0]
- record = {'binX': binX, 'binY': binY, 'counts': counts}
+ record = dict()
+ record['binX'] = binX
+ record['binY'] = binY
+ record['counts'] = counts
  v.append(record)
  else:
  binXOffset = struct.unpack('<i', uncompressedBytes[4:8])[0]
@@ -344,57 +467,125 @@ def readBlock(req, size):
  useShort = struct.unpack('<b', uncompressedBytes[12:13])[0]
  type_ = struct.unpack('<b', uncompressedBytes[13:14])[0]
  index = 0
- if (type_ == 1):
+ if type_ == 1:
  rowCount = struct.unpack('<h', uncompressedBytes[14:16])[0]
  temp = 16
- for _ in range(rowCount):
+ for i in range(rowCount):
  y = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
- temp += 2
+ temp = temp + 2
  binY = y + binYOffset
  colCount = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
- temp += 2
- for _ in range(colCount):
+ temp = temp + 2
+ for j in range(colCount):
  x = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
- temp += 2
+ temp = temp + 2
  binX = binXOffset + x
- if (useShort == 0):
+ if useShort == 0:
  c = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
- temp += 2
+ temp = temp + 2
  counts = c
  else:
  counts = struct.unpack('<f', uncompressedBytes[temp:(temp + 4)])[0]
- temp += 4
- record = {'binX': binX, 'binY': binY, 'counts': counts}
+ temp = temp + 4
+ record = dict()
+ record['binX'] = binX
+ record['binY'] = binY
+ record['counts'] = counts
  v.append(record)
- index += 1
+ index = index + 1
  elif type_ == 2:
  temp = 14
  nPts = struct.unpack('<i', uncompressedBytes[temp:(temp + 4)])[0]
- temp += 4
+ temp = temp + 4
  w = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
- temp += 2
+ temp = temp + 2
  for i in range(nPts):
  row = int(i / w)
  col = i - row * w
  bin1 = int(binXOffset + col)
  bin2 = int(binYOffset + row)
- if (useShort == 0):
+ if useShort == 0:
  c = struct.unpack('<h', uncompressedBytes[temp:(temp + 2)])[0]
- temp += 2
- if (c != -32768):
- record = {'binX': bin1, 'binY': bin2, 'counts': c}
+ temp = temp + 2
+ if c != -32768:
+ record = dict()
+ record['binX'] = bin1
+ record['binY'] = bin2
+ record['counts'] = c
  v.append(record)
- index += 1
+ index = index + 1
  else:
  counts = struct.unpack('<f', uncompressedBytes[temp:(temp + 4)])[0]
- temp += 4
- if (countsnot != 0x7fc00000):
- record = {'binX': bin1, 'binY': bin2, 'counts': counts}
+ temp = temp + 4
+ if counts != 0x7fc00000:
+ record = dict()
+ record['binX'] = bin1
+ record['binY'] = bin2
+ record['counts'] = counts
  v.append(record)
- index += 1
+ index = index + 1
  return v


+ def readBlockWorker(infile, is_synapse, blockNum, binsize, blockMap, norm, c1Norm, c2Norm, binPositionBox, isIntra,
+ version):
+ yActual = []
+ xActual = []
+ counts = []
+ idx = dict()
+ if blockNum in blockMap:
+ idx = blockMap[blockNum]
+ else:
+ idx['size'] = 0
+ idx['position'] = 0
+
+ if idx['size'] == 0:
+ records = []
+ else:
+ if infile.startswith("http"):
+ headers = getHttpHeader('bytes={0}-{1}'.format(idx['position'], idx['position'] + idx['size']), is_synapse)
+ s = requests.Session()
+ r = s.get(infile, headers=headers);
+ req = io.BytesIO(r.content)
+ else:
+ req = open(infile, 'rb')
+ req.seek(idx['position'])
+ records = readBlock(req, idx['size'], version)
+
+ # No caching currently; in Java code we keep all records and check positions later
+ if norm != "NONE":
+ for record in records:
+ binX = record['binX']
+ binY = record['binY']
+
+ if ((binPositionBox[0] <= binX <= binPositionBox[1] and binPositionBox[2] <= binY <=
+ binPositionBox[3]) or (
+ isIntra and binPositionBox[0] <= binY <= binPositionBox[1] and binPositionBox[2] <= binX <=
+ binPositionBox[3])):
+ c = record['counts']
+ a = c1Norm[binX] * c2Norm[binY]
+ if a != 0.0:
+ c = (c / a)
+ else:
+ c = "inf"
+ xActual.append(binX)
+ yActual.append(binY)
+ counts.append(c)
+ else:
+ for record in records:
+ binX = record['binX']
+ binY = record['binY']
+ if ((binPositionBox[0] <= binX <= binPositionBox[1] and binPositionBox[2] <= binY <=
+ binPositionBox[3]) or (
+ isIntra and binPositionBox[0] <= binY <= binPositionBox[1] and binPositionBox[2] <= binX <=
+ binPositionBox[3])):
+ c = record['counts']
+ xActual.append(binX)
+ yActual.append(binY)
+ counts.append(c)
+ return xActual, yActual, counts
+
+
  def readNormalizationVector(req):
  """ Reads the normalization vector from the file; presumes file pointer is
  in correct position
@@ -409,184 +600,188 @@ def readNormalizationVector(req):
  """
  value = []
  nValues = struct.unpack('<i', req.read(4))[0]
- for _ in range(nValues):
+ for i in range(nValues):
  d = struct.unpack('<d', req.read(8))[0]
  value.append(d)
  return value


- def straw(norm, infile, chr1loc, chr2loc, unit, binsize):
- """ This is the main workhorse method of the module. Reads a .hic file and
- extracts the given contact matrix. Stores in an array in sparse upper
- triangular format: row, column, (normalized) count
-
- Args:
- norm(str): Normalization type, one of VC, KR, VC_SQRT, or NONE
- infile(str): File name or URL of .hic file
- chr1loc(str): Chromosome name and (optionally) range, i.e. "1" or "1:10000:25000"
- chr2loc(str): Chromosome name and (optionally) range, i.e. "1" or "1:10000:25000"
- unit(str): One of BP or FRAG
- binsize(int): Resolution, i.e. 25000 for 25K
- """
- # clear the global variable blockMap so that it won't keep the data from previous calls
- for blockNum in list(blockMap.keys()):
- blockMap.pop(blockNum)
-
- magic_string = ""
- if (infile.startswith("http")):
- # try URL first. 100K should be sufficient for header
- raise ValueError("Network file is not supported right now.")
- # TODO replace requests to build urllib
- # headers = {'range': 'bytes=0-100000', 'x-amz-meta-requester': 'straw'}
- # s = requests.Session()
- # r = s.get(infile, headers=headers)
- # if (r.status_code >= 400):
- # print("Error accessing " + infile)
- # print("HTTP status code " + str(r.status_code))
- # return -1
- # req = io.BytesIO(r.content)
- # myrange = r.headers['content-range'].split('/')
- # totalbytes = myrange[1]
- else:
- req = open(infile, 'rb')
-
- if not norm in ["NONE", "VC", "VC_SQRT", "KR"]:
- print(
- "Norm specified incorrectly, must be one of <NONE/VC/VC_SQRT/KR>\nUsage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>\n")
- return -1
- if not unit in ["BP", "FRAG"]:
- print(
- "Unit specified incorrectly, must be one of <BP/FRAG>\nUsage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>\n")
- return -1
- c1pos1 = -100
- c1pos2 = -100
- c2pos1 = -100
- c2pos2 = -100
- chr1_arra = chr1loc.split(":")
- chr2_arra = chr2loc.split(":")
- chr1 = chr1_arra[0]
- chr2 = chr2_arra[0]
- if (len(chr1_arra) == 3):
- c1pos1 = chr1_arra[1]
- c1pos2 = chr1_arra[2]
- if (len(chr2_arra) == 3):
- c2pos1 = chr2_arra[1]
- c2pos2 = chr2_arra[2]
-
- list1 = readHeader(req, chr1, chr2, [c1pos1, c1pos2, c2pos1, c2pos2])
-
- master = list1[0]
- chr1ind = list1[1]
- chr2ind = list1[2]
- c1pos1 = int(list1[3])
- c1pos2 = int(list1[4])
- c2pos1 = int(list1[5])
- c2pos2 = int(list1[6])
- c1 = min(chr1ind, chr2ind)
- c2 = max(chr1ind, chr2ind)
- origRegionIndices = []
- regionIndices = []
- if (chr1ind > chr2ind):
- origRegionIndices.append(c2pos1)
- origRegionIndices.append(c2pos2)
- origRegionIndices.append(c1pos1)
- origRegionIndices.append(c1pos2)
- regionIndices.append(int(c2pos1 / binsize))
- regionIndices.append(int(c2pos2 / binsize))
- regionIndices.append(int(c1pos1 / binsize))
- regionIndices.append(int(c1pos2 / binsize))
- else:
- origRegionIndices.append(c1pos1)
- origRegionIndices.append(c1pos2)
- origRegionIndices.append(c2pos1)
- origRegionIndices.append(c2pos2)
- regionIndices.append(int(c1pos1 / binsize))
- regionIndices.append(int(c1pos2 / binsize))
- regionIndices.append(int(c2pos1 / binsize))
- regionIndices.append(int(c2pos2 / binsize))
-
- # Get footer: from master to end of file
- if (infile.startswith("http")):
- headers = {'range': 'bytes={0}-{1}'.format(master, totalbytes), 'x-amz-meta-requester': 'straw'}
- # print("Requesting {} bytes".format(int(totalbytes)-master))
- r = s.get(infile, headers=headers);
- # print("Received {} bytes".format(r.headers['Content-Length']))
- req = io.BytesIO(r.content)
- else:
- req.seek(master)
-
- list1 = readFooter(req, c1, c2, norm, unit, binsize)
- myFilePos = list1[0]
- c1NormEntry = list1[1]
- c2NormEntry = list1[2]
-
- if (norm != "NONE"):
- if (infile.startswith("http")):
- endrange = 'bytes={0}-{1}'.format(c1NormEntry['position'], c1NormEntry['position'] + c1NormEntry['size'])
- headers = {'range': endrange, 'x-amz-meta-requester': 'straw'}
- r = s.get(infile, headers=headers);
- req = io.BytesIO(r.content);
- c1Norm = readNormalizationVector(req)
-
- endrange = 'bytes={0}-{1}'.format(c2NormEntry['position'], c2NormEntry['position'] + c2NormEntry['size'])
- headers = {'range': endrange, 'x-amz-meta-requester': 'straw'}
- r = s.get(infile, headers=headers)
- req = io.BytesIO(r.content)
- else:
- req.seek(c1NormEntry['position'])
- c1Norm = readNormalizationVector(req)
- req.seek(c2NormEntry['position'])
- c2Norm = readNormalizationVector(req)
- if (infile.startswith("http")):
- headers = {'range': 'bytes={0}-'.format(myFilePos), 'x-amz-meta-requester': 'straw'}
- r = s.get(infile, headers=headers, stream=True)
- list1 = readMatrix(r.raw, unit, binsize)
- else:
- req.seek(myFilePos)
- list1 = readMatrix(req, unit, binsize)
-
- blockBinCount = list1[0]
- blockColumnCount = list1[1]
- blockNumbers = getBlockNumbersForRegionFromBinPosition(regionIndices, blockBinCount, blockColumnCount, c1 == c2)
- yActual = []
- xActual = []
- counts = []
-
- for i_set in (blockNumbers):
- idx = {}
- if (i_set in blockMap):
- idx = blockMap[i_set]
- else:
- idx['size'] = 0
- idx['position'] = 0
- if (idx['size'] == 0):
- records = []
+ def getHttpHeader(endrange, is_synapse):
+ if is_synapse:
+ return {'range': endrange}
+ return {'range': endrange, 'x-amz-meta-requester': 'straw'}
+
+
+ def readLocalNorm(infile, position):
+ req = open(infile, 'rb')
+ req.seek(position)
+ return readNormalizationVector(req)
+
+
+ def readHttpNorm(infile, normEntry, is_synapse):
+ endrange = 'bytes={0}-{1}'.format(normEntry['position'], normEntry['position'] + normEntry['size'])
+ headers = getHttpHeader(endrange, is_synapse)
+ s = requests.Session()
+ r = s.get(infile, headers=headers);
+ req = io.BytesIO(r.content);
+ return readNormalizationVector(req)
+
+
+ class straw:
+ def __init__(self, infile, is_synapse=False):
+ """ This is the main workhorse method of the module. Reads a .hic file and
+ extracts the given contact matrix. Stores in an array in sparse upper
+ triangular format: row, column, (normalized) count
+
+ Args:
+ norm(str): Normalization type, one of VC, KR, VC_SQRT, or NONE
+ infile(str): File name or URL of .hic file
+ chr1loc(str): Chromosome name and (optionally) range, i.e. "1" or "1:10000:25000"
+ chr2loc(str): Chromosome name and (optionally) range, i.e. "1" or "1:10000:25000"
+ unit(str): One of BP or FRAG
+ binsize(int): Resolution, i.e. 25000 for 25K
+ """
+
+ self.isHttpFile = infile.startswith("http")
+ self.infile = infile
+ self.is_synapse = is_synapse
+ self.master, self.version, totalbytes, self.chromDotSizes = readHeader(infile, is_synapse)
+ self.myFilePositions, self.normMap = readFooter(infile, is_synapse, self.master, totalbytes)
+
+ def getNormalizedMatrix(self, chr1, chr2, norm, unit, binsize):
+
+ if not (unit == "BP" or unit == "FRAG"):
+ print(
+ "Unit specified incorrectly, must be one of <BP/FRAG>\nUsage: straw <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize>\n")
+ return None
+
+ for chrom in [chr1, chr2]:
+ if chrom not in self.chromDotSizes.data:
+ print(str(chrom) + " wasn't found in the file. Check that the chromosome name matches the genome.\n")
+ return None
+
+ chrIndex1 = self.chromDotSizes.getIndex(chr1)
+ chrIndex2 = self.chromDotSizes.getIndex(chr2)
+ isIntra = chrIndex1 == chrIndex2
+
+ neededToFlipIndices = False
+ if chrIndex1 > chrIndex2:
+ neededToFlipIndices = True
+ chrIndex1, chrIndex2 = chrIndex2, chrIndex1
+ chr1, chr2 = chr2, chr1
+
+ executor = concurrent.futures.ThreadPoolExecutor()
+ if norm != "NONE":
+ try:
+ c1NormEntry = self.normMap[norm][chrIndex1][unit][binsize]
+ except:
+ print(
+ "File did not contain {0} norm vectors for chr {1} at {2} {3}\n".format(norm, chr1, binsize, unit))
+ return None
+
+ if not isIntra:
+ try:
+ c2NormEntry = self.normMap[norm][chrIndex2][unit][binsize]
+ except:
+ print("File did not contain {0} norm vectors for chr {1} at {2} {3}\n".format(norm, chr2, binsize,
+ unit))
+ return None
+ if self.isHttpFile:
+ futureNorm1 = executor.submit(readHttpNorm, self.infile, c1NormEntry, self.is_synapse)
+ if not isIntra:
+ futureNorm2 = executor.submit(readHttpNorm, self.infile, c2NormEntry, self.is_synapse)
+ else:
+ futureNorm1 = executor.submit(readLocalNorm, self.infile, c1NormEntry['position'])
+ if not isIntra:
+ futureNorm2 = executor.submit(readLocalNorm, self.infile, c2NormEntry['position'])
+
+ blockMap = dict()
+ key = str(chrIndex1) + "_" + str(chrIndex2)
+ if key not in self.myFilePositions:
+ print("File doesn't have the given {0} map\n".format(key))
+ return None
+ myFilePos = self.myFilePositions[key][0]
+ if self.isHttpFile:
+ headers = getHttpHeader('bytes={0}-'.format(myFilePos), self.is_synapse)
+ s = requests.Session()
+ r = s.get(self.infile, headers=headers, stream=True)
+ futureMatrix = executor.submit(readMatrix, r.raw, unit, binsize, blockMap)
  else:
- if (infile.startswith("http")):
- endrange = 'bytes={0}-{1}'.format(idx['position'], idx['position'] + idx['size'])
- headers = {'range': endrange, 'x-amz-meta-requester': 'straw'}
- r = s.get(infile, headers=headers);
- req = io.BytesIO(r.content);
+ req = open(self.infile, 'rb')
+ req.seek(myFilePos)
+ futureMatrix = executor.submit(readMatrix, req, unit, binsize, blockMap)
+
+ if norm != "NONE":
+ c1Norm = futureNorm1.result()
+ if isIntra:
+ c2Norm = c1Norm
  else:
- req.seek(idx['position'])
- records = readBlock(req, idx['size'])
-
- for j in range(len(records)):
- rec = records[j]
- x = rec['binX'] * binsize
- y = rec['binY'] * binsize
- c = rec['counts']
- if (norm != "NONE"):
- a = c1Norm[rec['binX']] * c2Norm[rec['binY']]
- c = (c / (c1Norm[rec['binX']] * c2Norm[rec['binY']])) if (a != 0.0) else "inf"
- if ((x >= origRegionIndices[0] and x <= origRegionIndices[1] and y >= origRegionIndices[2] and y <=
- origRegionIndices[3]) or (
- (c1 == c2) and y >= origRegionIndices[0] and y <= origRegionIndices[1] and x >= origRegionIndices[
- 2] and x <= origRegionIndices[3])):
- xActual.append(x)
- yActual.append(y)
- counts.append(c)
- return [xActual, yActual, counts]
+ c2Norm = futureNorm2.result()
+ else:
+ c1Norm, c2Norm = None, None
+
+ blockBinCount, blockColumnCount = futureMatrix.result()
+ return normalizedmatrix(self.infile, self.is_synapse, binsize, isIntra, neededToFlipIndices, blockBinCount,
+ blockColumnCount, blockMap, norm, c1Norm, c2Norm, self.version)
+
+
+ class normalizedmatrix:
+ def __init__(self, infile, is_synapse, binsize, isIntra, neededToFlipIndices, blockBinCount, blockColumnCount,
+ blockMap, norm, c1Norm, c2Norm, version):
+ self.infile = infile
+ self.is_synapse = is_synapse
+ self.isHttpFile = infile.startswith("http")
+ self.binsize = binsize
+ self.isIntra = isIntra
+ self.neededToFlipIndices = neededToFlipIndices
+ self.blockBinCount = blockBinCount
+ self.blockColumnCount = blockColumnCount
+ self.norm = norm
+ self.c1Norm = c1Norm
+ self.c2Norm = c2Norm
+ self.blockMap = blockMap
+ self.version = version
+
+ def getDataFromBinRegion(self, X1, X2, Y1, Y2):
+ binsize = self.binsize
+ if self.neededToFlipIndices:
+ X1, X2, Y1, Y2 = Y1, Y2, X1, X2
+ binPositionsBox = []
+ binPositionsBox.append(int(X1))
+ binPositionsBox.append(int(X2))
+ binPositionsBox.append(int(Y1))
+ binPositionsBox.append(int(Y2))
+
+ blockNumbers = getBlockNumbersForRegionFromBinPosition(binPositionsBox, self.blockBinCount,
+ self.blockColumnCount, self.isIntra)
+ yActual = []
+ xActual = []
+ counts = []
+
+ executor = concurrent.futures.ProcessPoolExecutor()
+ futures = [
+ executor.submit(readBlockWorker, self.infile, self.is_synapse, bNum, binsize, self.blockMap, self.norm, \
+ self.c1Norm, self.c2Norm, binPositionsBox, self.isIntra, self.version) for bNum in
+ blockNumbers]
+
+ for future in futures:
+ xTemp, yTemp, cTemp = future.result()
+ xActual.extend(xTemp)
+ yActual.extend(yTemp)
+ counts.extend(cTemp)
+ return [xActual, yActual, counts]
+
+ def getDataFromGenomeRegion(self, X1, X2, Y1, Y2):
+ binsize = self.binsize
+ return self.getDataFromBinRegion(X1 / binsize, math.ceil(X2 / binsize), Y1 / binsize, math.ceil(Y2 / binsize))
+
+ def getBatchedDataFromGenomeRegion(self, listOfCoordinates):
+ executor = concurrent.futures.ThreadPoolExecutor()
+ futures = [executor.submit(self.getDataFromGenomeRegion, a, b, c, d) for (a, b, c, d) in listOfCoordinates]
+ finalResults = list()
+ for future in futures:
+ finalResults.append(future.result())
+ return finalResults


  def printme(norm, infile, chr1loc, chr2loc, unit, binsize, outfile):
@@ -602,8 +797,16 @@ def printme(norm, infile, chr1loc, chr2loc, unit, binsize, outfile):
  binsize(int): Resolution, i.e. 25000 for 25K
  outfile(str): Name of text file to write to
  """
- with open(outfile, 'w') as f:
- result = straw(norm, infile, chr1loc, chr2loc, unit, binsize)
- for i in range(len(result[0])):
- f.write("{0}\t{1}\t{2}\n".format(result[0][i], result[1][i], result[2][i]))
- # print("{0}\t{1}\t{2}".format(result[0][i], result[1][i], result[2][i]))
+ f = open(outfile, 'w')
+ strawObj = straw(infile)
+
+ chr1, X1, X2 = strawObj.chromDotSizes.figureOutEndpoints(chr1loc)
+ chr2, Y1, Y2 = strawObj.chromDotSizes.figureOutEndpoints(chr2loc)
+
+ matrxObj = strawObj.getNormalizedMatrix(chr1, chr2, norm, unit, binsize)
+
+ result = matrxObj.getDataFromGenomeRegion(X1, X2, Y1, Y2)
+
+ for i in range(len(result[0])):
+ f.write("{0}\t{1}\t{2}\n".format(result[0][i], result[1][i], result[2][i]))
+ f.close()
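
For orientation, the sketch below shows how the class-based straw API introduced in 0.3.9 is driven, following the docstring and the rewritten printme() in the diff above. The .hic file name and the region coordinates are placeholders, and the import path is an assumption based on the module's location at coolbox/utilities/hic/straw.py; it is not part of the diff itself.

# Usage sketch for the new class-based straw API (0.3.9).
# File name, chromosome names and coordinates below are placeholders.
from coolbox.utilities.hic.straw import straw  # assumed import path

if __name__ == "__main__":  # getDataFromBinRegion spawns a ProcessPoolExecutor
    hic = straw("HIC001.hic")  # header and footer are parsed once, in __init__

    # Select a contact matrix: chr1, chr2, normalization, unit, bin size.
    matrix = hic.getNormalizedMatrix("5", "5", "KR", "BP", 5000)

    if matrix is not None:
        # Query by genomic coordinates; the result is three parallel lists:
        # bin X, bin Y, (normalized) count.
        x, y, counts = matrix.getDataFromGenomeRegion(0, 2500000, 0, 2500000)
        for bx, by, c in zip(x, y, counts):
            print("{0}\t{1}\t{2}".format(bx, by, c))

Compared with 0.3.7, where straw() was a single function that took the normalization, file, locations, unit and bin size in one call, the file is now opened once and regions are queried repeatedly from the same object, optionally in batches via getBatchedDataFromGenomeRegion.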