seqchromloader 0.7.2__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: seqchromloader
3
- Version: 0.7.2
3
+ Version: 0.7.4
4
4
  Summary: Sequence and chromatin dataloader for deep learning
5
5
  Home-page: https://github.com/yztxwd/seqchromloader
6
6
  Author-email: yztxwd@gmail.com
@@ -4,6 +4,7 @@ description = """
4
4
  Given bed file, return sequence and chromatin info
5
5
  """
6
6
 
7
+ import math
7
8
  import logging
8
9
  import torch
9
10
  import random
@@ -20,11 +21,6 @@ from pytorch_lightning import LightningDataModule
20
21
 
21
22
  from seqchromloader import utils
22
23
 
23
- def worker_init_fn(worker_id):
24
- worker_info = torch.utils.data.get_worker_info()
25
- dataset = worker_info.dataset
26
- dataset.initialize()
27
-
28
24
  class SeqChromLoader():
29
25
  """
30
26
  :param dataloader_kws: keyword arguments passed to ``torch.utils.data.DataLoader``
@@ -37,14 +33,12 @@ class SeqChromLoader():
37
33
  def __call__(self, *args, dataloader_kws:dict={}, **kwargs):
38
34
  # default dataloader kws
39
35
  if dataloader_kws is not None:
40
- wif = dataloader_kws.pop("worker_init_fn", worker_init_fn)
41
36
  num_workers = dataloader_kws.pop("num_workers", 1)
42
37
  else:
43
- wif = worker_init_fn
44
38
  num_workers = 1
45
39
 
46
40
  return DataLoader(self.SeqChromDataset(*args, **kwargs),
47
- worker_init_fn=wif, num_workers=num_workers, **dataloader_kws)
41
+ num_workers=num_workers, **dataloader_kws)
48
42
 
49
43
  def seqChromLoaderCurry(SeqChromDataset):
50
44
 
@@ -97,7 +91,7 @@ class _SeqChromDatasetByWds(IterableDataset):
97
91
 
98
92
  SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds)
99
93
 
100
- class _SeqChromDatasetByDataFrame(Dataset):
94
+ class _SeqChromDatasetByDataFrame(IterableDataset):
101
95
  """
102
96
  :param dataframe: pandas dataframe describing genomics regions to extract info from, every region has to be of the same length.
103
97
  :type dataframe: pd.DataFrame
@@ -116,8 +110,8 @@ class _SeqChromDatasetByDataFrame(Dataset):
116
110
  bigwig_filelist:list,
117
111
  target_bam=None,
118
112
  transforms:dict=None,
119
- initialize_first=False,
120
- return_region=False):
113
+ return_region=False,
114
+ patch_left=0, patch_right=0):
121
115
 
122
116
  self.dataframe = dataframe
123
117
  self.genome_fasta = genome_fasta
@@ -128,10 +122,11 @@ class _SeqChromDatasetByDataFrame(Dataset):
128
122
  self.target_pysam = None
129
123
 
130
124
  self.transforms = transforms
131
-
132
- if initialize_first: self.initialize()
133
-
134
125
  self.return_region = return_region
126
+ self.patch_left = patch_left
127
+ self.patch_right = patch_right
128
+
129
+ self.start = 0; self.end = len(self.dataframe)
135
130
 
136
131
  def initialize(self):
137
132
  # create the stream handler after child processes spawned to enable parallel reading
@@ -141,31 +136,47 @@ class _SeqChromDatasetByDataFrame(Dataset):
141
136
  if self.target_bam is not None:
142
137
  self.target_pysam = pysam.AlignmentFile(self.target_bam)
143
138
 
144
- def __len__(self):
145
- return len(self.dataframe)
139
+ def __iter__(self):
140
+ self.initialize()
141
+ worker_info = torch.utils.data.get_worker_info()
142
+ if worker_info is not None: # multi-process data loading: split the workload among workers
143
+ # split workload
144
+ per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
145
+ worker_id = worker_info.id
146
+ iter_start = self.start + worker_id * per_worker
147
+ iter_end = min(iter_start + per_worker, self.end)
148
+ # replace start and end
149
+ self.start = iter_start; self.end = iter_end
150
+
151
+ for idx in range(self.start, self.end):
152
+ item = self.dataframe.iloc[idx,]
153
+ try:
154
+ feature = utils.extract_info(
155
+ item.chrom,
156
+ item.start,
157
+ item.end,
158
+ item.label,
159
+ genome_pyfaidx=self.genome_pyfaidx,
160
+ bigwigs=self.bigwigs,
161
+ target=self.target_pysam,
162
+ strand=item.strand,
163
+ transforms=self.transforms,
164
+ patch_left=self.patch_left,
165
+ patch_right=self.patch_right
166
+ )
167
+ except utils.BigWigInaccessible as e:
168
+ logging.warn(f"Inaccessible bigwig error detected in region {item.chrom}:{item.start}-{item.end}, Skipping...")
169
+ continue
170
+ except AssertionError as e:
171
+ logging.warn(f"AssertionError detected in region {item.chrom}:{item.start}-{item.end}, Skipping")
172
+ continue
173
+
174
+ if not self.return_region:
175
+ yield feature['seq'], feature['chrom'], feature['target'], feature['label']
176
+ else:
177
+ yield f'{item.chrom}:{item.start}-{item.end}', feature['seq'], feature['chrom'], feature['target'], feature['label']
178
+
146
179
 
147
- def __getitem__(self, idx):
148
- item = self.dataframe.iloc[idx,]
149
- try:
150
- feature = utils.extract_info(
151
- item.chrom,
152
- item.start,
153
- item.end,
154
- item.label,
155
- genome_pyfaidx=self.genome_pyfaidx,
156
- bigwigs=self.bigwigs,
157
- target=self.target_pysam,
158
- strand=item.strand,
159
- transforms=self.transforms
160
- )
161
- except utils.BigWigInaccessible as e:
162
- raise e
163
-
164
- if not self.return_region:
165
- return feature['seq'], feature['chrom'], feature['target'], feature['label']
166
- else:
167
- return f'{item.chrom}:{item.start}-{item.end}', feature['seq'], feature['chrom'], feature['target'], feature['label']
168
-
169
180
  SeqChromDatasetByDataFrame = seqChromLoaderCurry(_SeqChromDatasetByDataFrame)
170
181
 
171
182
  class _SeqChromDatasetByBed(_SeqChromDatasetByDataFrame):
@@ -181,15 +192,17 @@ class _SeqChromDatasetByBed(_SeqChromDatasetByDataFrame):
181
192
  :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
182
193
  :type transforms: dict of functions
183
194
  """
184
- def __init__(self, bed: str, genome_fasta: str, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False, return_region=False):
195
+ def __init__(self, bed: str, genome_fasta: str, bigwig_filelist:list, target_bam=None,
196
+ transforms:dict=None, return_region=False,
197
+ patch_left=0, patch_right=0):
185
198
  dataframe = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand' ])
186
199
  super().__init__(dataframe,
187
200
  genome_fasta,
188
201
  bigwig_filelist,
189
202
  target_bam,
190
203
  transforms,
191
- initialize_first,
192
- return_region)
204
+ return_region,
205
+ patch_left, patch_right)
193
206
 
194
207
  SeqChromDatasetByBed = seqChromLoaderCurry(_SeqChromDatasetByBed)
195
208
 
@@ -186,7 +186,7 @@ def dump_data_webdataset_worker(coords,
186
186
 
187
187
  if batch_size is None:
188
188
  feature_dict = defaultdict()
189
- feature_dict["__key__"] = f"{rindex}_{item.chrom}:{item.start}-{item.end}_{item.strand}"
189
+ feature_dict["__key__"] = f"{rindex}_{item.chrom}:{item.start-patch_left}-{item.end+patch_right}_{item.strand}"
190
190
  feature_dict["seq.npy"] = feature['seq']
191
191
  feature_dict["chrom.npy"] = feature['chrom']
192
192
  feature_dict["target.npy"] = feature['target']
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: seqchromloader
3
- Version: 0.7.2
3
+ Version: 0.7.4
4
4
  Summary: Sequence and chromatin dataloader for deep learning
5
5
  Home-page: https://github.com/yztxwd/seqchromloader
6
6
  Author-email: yztxwd@gmail.com
@@ -20,7 +20,7 @@ setup(
20
20
  # eg: 1.0.0, 1.0.1, 3.0.2, 5.0-beta, etc.
21
21
  # You CANNOT upload two versions of your package with the same version number
22
22
  # This field is REQUIRED
23
- version="0.7.2",
23
+ version="0.7.4",
24
24
 
25
25
  # The packages that constitute your project.
26
26
  # For my project, I have only one - "pydash".
File without changes
File without changes