seqchromloader 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: seqchromloader
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Sequence and chromatin dataloader for deep learning
5
5
  Home-page: https://github.com/yztxwd/seqchromloader
6
6
  Author-email: yztxwd@gmail.com
@@ -1,2 +1,2 @@
1
1
  from .loader import SeqChromDatasetByDataFrame, SeqChromDatasetByBed, SeqChromDatasetByWds, SeqChromDataModule
2
- from .writer import dump_data_webdataset
2
+ from .writer import dump_data_webdataset, convert_data_webdataset
@@ -57,12 +57,13 @@ class _SeqChromDatasetByWds(IterableDataset):
57
57
  :param transforms: A dictionary of functions to transform the output data, accepted keys are **["seq", "chrom", "target", "label"]**
58
58
  :type transforms: dict of functions
59
59
  """
60
- def __init__(self, wds, transforms:dict=None, rank=0, world_size=1):
60
+ def __init__(self, wds, transforms:dict=None, rank=0, world_size=1, keep_key=False):
61
61
  self.wds = wds
62
62
  self.transforms = transforms
63
63
 
64
64
  self.rank = rank
65
65
  self.world_size = world_size
66
+ self.keep_key = keep_key
66
67
 
67
68
  def initialize(self):
68
69
  # this function will be called by worker_init_function in DataLoader
@@ -85,7 +86,10 @@ class _SeqChromDatasetByWds(IterableDataset):
85
86
  if self.transforms is not None:
86
87
  pipeline.append(wds.map_dict(**self.transforms))
87
88
 
88
- pipeline.append(wds.to_tuple("seq", "chrom", "target", "label"))
89
+ if self.keep_key:
90
+ pipeline.append(wds.to_tuple("__key__", "seq", "chrom", "target", "label"))
91
+ else:
92
+ pipeline.append(wds.to_tuple("seq", "chrom", "target", "label"))
89
93
 
90
94
  ds = wds.DataPipeline(*pipeline)
91
95
 
@@ -17,14 +17,43 @@ import pysam
17
17
  import pyBigWig
18
18
  import webdataset as wds
19
19
 
20
- from seqchromloader import utils
20
+ from . import utils
21
+ from .loader import _SeqChromDatasetByWds
21
22
 
23
+ def convert_data_webdataset(wds_in, wds_out, transforms=None, compress=False):
24
+ """
25
+ Transform the provided webdataset
26
+
27
+ :param wds_in: input webdataset file
28
+ :type wds_in: string
29
+ :param wds_out: output webdataset file
30
+ :type wds_out: string
31
+ :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
32
+ :type transforms: dict of functions
33
+ :param compress: whether to compress the output file
34
+ :type compress: boolean
35
+ """
36
+
37
+ ds = _SeqChromDatasetByWds(wds_in, transforms=transforms, keep_key=True)
38
+ sink = wds.TarWriter(wds_out, compress=compress)
39
+ for (key, seq, chrom, target, label) in ds:
40
+ feature_dict = defaultdict()
41
+ feature_dict["__key__"] = key
42
+
43
+ feature_dict["seq.npy"] = seq
44
+ feature_dict["chrom.npy"] = chrom
45
+ feature_dict["target.npy"] = target
46
+ feature_dict["label.npy"] = label
47
+ sink.write(feature_dict)
48
+ sink.close()
49
+
22
50
  def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
23
51
  target_bam=None,
24
52
  outdir="dataset/", outprefix="seqchrom",
25
53
  compress=True,
26
54
  numProcessors=1,
27
- transforms=None):
55
+ transforms=None,
56
+ DALI=False):
28
57
  """
29
58
  Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format
30
59
 
@@ -46,6 +75,8 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
46
75
  :type compress: boolean
47
76
  :param numProcessors: number of processors
48
77
  :type numProcessors: int
78
+ :param DALI: set to True to produce a dataset for NVIDIA DALI; all arrays are then saved as raw bytes, so the array shape information is lost
79
+ :type DALI: boolean
49
80
  """
50
81
 
51
82
  # split coordinates and assign chunks to workers
@@ -61,10 +92,16 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
61
92
  target_bam=target_bam,
62
93
  compress=compress,
63
94
  outdir=outdir,
64
- transforms=transforms)
95
+ transforms=transforms,
96
+ DALI=DALI)
97
+
98
+ count_of_digits = 0
99
+ while num_chunks > 0:
100
+ num_chunks = int(num_chunks/10)
101
+ count_of_digits += 1
65
102
 
66
103
  pool = Pool(numProcessors)
67
- res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" + str(i) for i in range(num_chunks)]))
104
+ res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" + format(i, f'0{count_of_digits}d') for i in range(num_chunks)]))
68
105
  files = res.get()
69
106
 
70
107
  return files
@@ -76,7 +113,8 @@ def dump_data_webdataset_worker(coords,
76
113
  target_bam=None,
77
114
  outdir="dataset/",
78
115
  compress=True,
79
- transforms=None):
116
+ transforms=None,
117
+ DALI=False):
80
118
  # get handlers
81
119
  genome_pyfasta = pyfasta.Fasta(fasta)
82
120
  bigwigs = [pyBigWig.open(bw) for bw in bigwig_files]
@@ -103,11 +141,17 @@ def dump_data_webdataset_worker(coords,
103
141
  )
104
142
  except utils.BigWigInaccessible as e:
105
143
  continue
106
-
107
- feature_dict["seq.npy"] = feature['seq']
108
- feature_dict["chrom.npy"] = feature['chrom']
109
- feature_dict["target.npy"] = feature['target']
110
- feature_dict["label.npy"] = feature['label']
144
+
145
+ if not DALI:
146
+ feature_dict["seq.npy"] = feature['seq']
147
+ feature_dict["chrom.npy"] = feature['chrom']
148
+ feature_dict["target.npy"] = feature['target']
149
+ feature_dict["label.npy"] = feature['label']
150
+ else:
151
+ feature_dict["seq.npy"] = feature['seq'].tobytes()
152
+ feature_dict["chrom.npy"] = feature['chrom'].tobytes()
153
+ feature_dict["target.npy"] = feature['target'].tobytes()
154
+ feature_dict["label.npy"] = feature['label'].tobytes()
111
155
 
112
156
  sink.write(feature_dict)
113
157
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: seqchromloader
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Sequence and chromatin dataloader for deep learning
5
5
  Home-page: https://github.com/yztxwd/seqchromloader
6
6
  Author-email: yztxwd@gmail.com
@@ -20,7 +20,7 @@ setup(
20
20
  # eg: 1.0.0, 1.0.1, 3.0.2, 5.0-beta, etc.
21
21
  # You CANNOT upload two versions of your package with the same version number
22
22
  # This field is REQUIRED
23
- version="0.3.0",
23
+ version="0.4.0",
24
24
 
25
25
  # The packages that constitute your project.
26
26
  # For my project, I have only one - "pydash".
File without changes
File without changes