seqchromloader 0.7.2__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/PKG-INFO +1 -1
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader/loader.py +54 -41
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader/writer.py +1 -1
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader.egg-info/PKG-INFO +1 -1
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/setup.py +1 -1
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/README.md +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader/__init__.py +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader/utils.py +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader.egg-info/SOURCES.txt +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader.egg-info/dependency_links.txt +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader.egg-info/requires.txt +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/seqchromloader.egg-info/top_level.txt +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/setup.cfg +0 -0
- {seqchromloader-0.7.2 → seqchromloader-0.7.4}/tests/test_writer_loader.py +0 -0
|
@@ -4,6 +4,7 @@ description = """
|
|
|
4
4
|
Given bed file, return sequence and chromatin info
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import math
|
|
7
8
|
import logging
|
|
8
9
|
import torch
|
|
9
10
|
import random
|
|
@@ -20,11 +21,6 @@ from pytorch_lightning import LightningDataModule
|
|
|
20
21
|
|
|
21
22
|
from seqchromloader import utils
|
|
22
23
|
|
|
23
|
-
def worker_init_fn(worker_id):
|
|
24
|
-
worker_info = torch.utils.data.get_worker_info()
|
|
25
|
-
dataset = worker_info.dataset
|
|
26
|
-
dataset.initialize()
|
|
27
|
-
|
|
28
24
|
class SeqChromLoader():
|
|
29
25
|
"""
|
|
30
26
|
:param dataloader_kws: keyword arguments passed to ``torch.utils.data.DataLoader``
|
|
@@ -37,14 +33,12 @@ class SeqChromLoader():
|
|
|
37
33
|
def __call__(self, *args, dataloader_kws:dict={}, **kwargs):
|
|
38
34
|
# default dataloader kws
|
|
39
35
|
if dataloader_kws is not None:
|
|
40
|
-
wif = dataloader_kws.pop("worker_init_fn", worker_init_fn)
|
|
41
36
|
num_workers = dataloader_kws.pop("num_workers", 1)
|
|
42
37
|
else:
|
|
43
|
-
wif = worker_init_fn
|
|
44
38
|
num_workers = 1
|
|
45
39
|
|
|
46
40
|
return DataLoader(self.SeqChromDataset(*args, **kwargs),
|
|
47
|
-
|
|
41
|
+
num_workers=num_workers, **dataloader_kws)
|
|
48
42
|
|
|
49
43
|
def seqChromLoaderCurry(SeqChromDataset):
|
|
50
44
|
|
|
@@ -97,7 +91,7 @@ class _SeqChromDatasetByWds(IterableDataset):
|
|
|
97
91
|
|
|
98
92
|
SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds)
|
|
99
93
|
|
|
100
|
-
class _SeqChromDatasetByDataFrame(Dataset):
|
|
94
|
+
class _SeqChromDatasetByDataFrame(IterableDataset):
|
|
101
95
|
"""
|
|
102
96
|
:param dataframe: pandas dataframe describing genomics regions to extract info from, every region has to be of the same length.
|
|
103
97
|
:type dataframe: pd.DataFrame
|
|
@@ -116,8 +110,8 @@ class _SeqChromDatasetByDataFrame(Dataset):
|
|
|
116
110
|
bigwig_filelist:list,
|
|
117
111
|
target_bam=None,
|
|
118
112
|
transforms:dict=None,
|
|
119
|
-
|
|
120
|
-
|
|
113
|
+
return_region=False,
|
|
114
|
+
patch_left=0, patch_right=0):
|
|
121
115
|
|
|
122
116
|
self.dataframe = dataframe
|
|
123
117
|
self.genome_fasta = genome_fasta
|
|
@@ -128,10 +122,11 @@ class _SeqChromDatasetByDataFrame(Dataset):
|
|
|
128
122
|
self.target_pysam = None
|
|
129
123
|
|
|
130
124
|
self.transforms = transforms
|
|
131
|
-
|
|
132
|
-
if initialize_first: self.initialize()
|
|
133
|
-
|
|
134
125
|
self.return_region = return_region
|
|
126
|
+
self.patch_left = patch_left
|
|
127
|
+
self.patch_right = patch_right
|
|
128
|
+
|
|
129
|
+
self.start = 0; self.end = len(self.dataframe)
|
|
135
130
|
|
|
136
131
|
def initialize(self):
|
|
137
132
|
# create the stream handler after child processes spawned to enable parallel reading
|
|
@@ -141,31 +136,47 @@ class _SeqChromDatasetByDataFrame(Dataset):
|
|
|
141
136
|
if self.target_bam is not None:
|
|
142
137
|
self.target_pysam = pysam.AlignmentFile(self.target_bam)
|
|
143
138
|
|
|
144
|
-
def
|
|
145
|
-
|
|
139
|
+
def __iter__(self):
|
|
140
|
+
self.initialize()
|
|
141
|
+
worker_info = torch.utils.data.get_worker_info()
|
|
142
|
+
if worker_info is not None: # single-process data loading, return the full iterator
|
|
143
|
+
# split workload
|
|
144
|
+
per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
|
|
145
|
+
worker_id = worker_info.id
|
|
146
|
+
iter_start = self.start + worker_id * per_worker
|
|
147
|
+
iter_end = min(iter_start + per_worker, self.end)
|
|
148
|
+
# replace start and end
|
|
149
|
+
self.start = iter_start; self.end = iter_end
|
|
150
|
+
|
|
151
|
+
for idx in range(self.start, self.end):
|
|
152
|
+
item = self.dataframe.iloc[idx,]
|
|
153
|
+
try:
|
|
154
|
+
feature = utils.extract_info(
|
|
155
|
+
item.chrom,
|
|
156
|
+
item.start,
|
|
157
|
+
item.end,
|
|
158
|
+
item.label,
|
|
159
|
+
genome_pyfaidx=self.genome_pyfaidx,
|
|
160
|
+
bigwigs=self.bigwigs,
|
|
161
|
+
target=self.target_pysam,
|
|
162
|
+
strand=item.strand,
|
|
163
|
+
transforms=self.transforms,
|
|
164
|
+
patch_left=self.patch_left,
|
|
165
|
+
patch_right=self.patch_right
|
|
166
|
+
)
|
|
167
|
+
except utils.BigWigInaccessible as e:
|
|
168
|
+
logging.warn(f"Inaccessible bigwig error detected in region {item.chrom}:{item.start}-{item.end}, Skipping...")
|
|
169
|
+
continue
|
|
170
|
+
except AssertionError as e:
|
|
171
|
+
logging.warn(f"AssertionError detected in region {item.chrom}:{item.start}-{item.end}, Skipping")
|
|
172
|
+
continue
|
|
173
|
+
|
|
174
|
+
if not self.return_region:
|
|
175
|
+
yield feature['seq'], feature['chrom'], feature['target'], feature['label']
|
|
176
|
+
else:
|
|
177
|
+
yield f'{item.chrom}:{item.start}-{item.end}', feature['seq'], feature['chrom'], feature['target'], feature['label']
|
|
178
|
+
|
|
146
179
|
|
|
147
|
-
def __getitem__(self, idx):
|
|
148
|
-
item = self.dataframe.iloc[idx,]
|
|
149
|
-
try:
|
|
150
|
-
feature = utils.extract_info(
|
|
151
|
-
item.chrom,
|
|
152
|
-
item.start,
|
|
153
|
-
item.end,
|
|
154
|
-
item.label,
|
|
155
|
-
genome_pyfaidx=self.genome_pyfaidx,
|
|
156
|
-
bigwigs=self.bigwigs,
|
|
157
|
-
target=self.target_pysam,
|
|
158
|
-
strand=item.strand,
|
|
159
|
-
transforms=self.transforms
|
|
160
|
-
)
|
|
161
|
-
except utils.BigWigInaccessible as e:
|
|
162
|
-
raise e
|
|
163
|
-
|
|
164
|
-
if not self.return_region:
|
|
165
|
-
return feature['seq'], feature['chrom'], feature['target'], feature['label']
|
|
166
|
-
else:
|
|
167
|
-
return f'{item.chrom}:{item.start}-{item.end}', feature['seq'], feature['chrom'], feature['target'], feature['label']
|
|
168
|
-
|
|
169
180
|
SeqChromDatasetByDataFrame = seqChromLoaderCurry(_SeqChromDatasetByDataFrame)
|
|
170
181
|
|
|
171
182
|
class _SeqChromDatasetByBed(_SeqChromDatasetByDataFrame):
|
|
@@ -181,15 +192,17 @@ class _SeqChromDatasetByBed(_SeqChromDatasetByDataFrame):
|
|
|
181
192
|
:param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
|
|
182
193
|
:type transforms: dict of functions
|
|
183
194
|
"""
|
|
184
|
-
def __init__(self, bed: str, genome_fasta: str, bigwig_filelist:list, target_bam=None,
|
|
195
|
+
def __init__(self, bed: str, genome_fasta: str, bigwig_filelist:list, target_bam=None,
|
|
196
|
+
transforms:dict=None, return_region=False,
|
|
197
|
+
patch_left=0, patch_right=0):
|
|
185
198
|
dataframe = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand' ])
|
|
186
199
|
super().__init__(dataframe,
|
|
187
200
|
genome_fasta,
|
|
188
201
|
bigwig_filelist,
|
|
189
202
|
target_bam,
|
|
190
203
|
transforms,
|
|
191
|
-
|
|
192
|
-
|
|
204
|
+
return_region,
|
|
205
|
+
patch_left, patch_right)
|
|
193
206
|
|
|
194
207
|
SeqChromDatasetByBed = seqChromLoaderCurry(_SeqChromDatasetByBed)
|
|
195
208
|
|
|
@@ -186,7 +186,7 @@ def dump_data_webdataset_worker(coords,
|
|
|
186
186
|
|
|
187
187
|
if batch_size is None:
|
|
188
188
|
feature_dict = defaultdict()
|
|
189
|
-
feature_dict["__key__"] = f"{rindex}_{item.chrom}:{item.start}-{item.end}_{item.strand}"
|
|
189
|
+
feature_dict["__key__"] = f"{rindex}_{item.chrom}:{item.start-patch_left}-{item.end+patch_right}_{item.strand}"
|
|
190
190
|
feature_dict["seq.npy"] = feature['seq']
|
|
191
191
|
feature_dict["chrom.npy"] = feature['chrom']
|
|
192
192
|
feature_dict["target.npy"] = feature['target']
|
|
@@ -20,7 +20,7 @@ setup(
|
|
|
20
20
|
# eg: 1.0.0, 1.0.1, 3.0.2, 5.0-beta, etc.
|
|
21
21
|
# You CANNOT upload two versions of your package with the same version number
|
|
22
22
|
# This field is REQUIRED
|
|
23
|
-
version="0.7.2",
|
|
23
|
+
version="0.7.4",
|
|
24
24
|
|
|
25
25
|
# The packages that constitute your project.
|
|
26
26
|
# For my project, I have only one - "pydash".
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|