seqchromloader 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/PKG-INFO +1 -1
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader/__init__.py +1 -1
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader/loader.py +6 -2
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader/writer.py +54 -10
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader.egg-info/PKG-INFO +1 -1
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/setup.py +1 -1
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/README.md +0 -0
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader/utils.py +0 -0
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader.egg-info/SOURCES.txt +0 -0
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader.egg-info/dependency_links.txt +0 -0
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader.egg-info/requires.txt +0 -0
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/seqchromloader.egg-info/top_level.txt +0 -0
- {seqchromloader-0.3.0 → seqchromloader-0.4.0}/setup.cfg +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
from .loader import SeqChromDatasetByDataFrame, SeqChromDatasetByBed, SeqChromDatasetByWds, SeqChromDataModule
|
|
2
|
-
from .writer import dump_data_webdataset
|
|
2
|
+
from .writer import dump_data_webdataset, convert_data_webdataset
|
|
@@ -57,12 +57,13 @@ class _SeqChromDatasetByWds(IterableDataset):
|
|
|
57
57
|
:param transforms: A dictionary of functions to transform the output data, accepted keys are **["seq", "chrom", "target", "label"]**
|
|
58
58
|
:type transforms: dict of functions
|
|
59
59
|
"""
|
|
60
|
-
def __init__(self, wds, transforms:dict=None, rank=0, world_size=1):
|
|
60
|
+
def __init__(self, wds, transforms:dict=None, rank=0, world_size=1, keep_key=False):
|
|
61
61
|
self.wds = wds
|
|
62
62
|
self.transforms = transforms
|
|
63
63
|
|
|
64
64
|
self.rank = rank
|
|
65
65
|
self.world_size = world_size
|
|
66
|
+
self.keep_key = keep_key
|
|
66
67
|
|
|
67
68
|
def initialize(self):
|
|
68
69
|
# this function will be called by worker_init_function in DataLoader
|
|
@@ -85,7 +86,10 @@ class _SeqChromDatasetByWds(IterableDataset):
|
|
|
85
86
|
if self.transforms is not None:
|
|
86
87
|
pipeline.append(wds.map_dict(**self.transforms))
|
|
87
88
|
|
|
88
|
-
|
|
89
|
+
if self.keep_key:
|
|
90
|
+
pipeline.append(wds.to_tuple("__key__", "seq", "chrom", "target", "label"))
|
|
91
|
+
else:
|
|
92
|
+
pipeline.append(wds.to_tuple("seq", "chrom", "target", "label"))
|
|
89
93
|
|
|
90
94
|
ds = wds.DataPipeline(*pipeline)
|
|
91
95
|
|
|
@@ -17,14 +17,43 @@ import pysam
|
|
|
17
17
|
import pyBigWig
|
|
18
18
|
import webdataset as wds
|
|
19
19
|
|
|
20
|
-
from
|
|
20
|
+
from . import utils
|
|
21
|
+
from .loader import _SeqChromDatasetByWds
|
|
21
22
|
|
|
23
|
+
def convert_data_webdataset(wds_in, wds_out, transforms=None, compress=False):
|
|
24
|
+
"""
|
|
25
|
+
Transform the provided webdataset
|
|
26
|
+
|
|
27
|
+
:param wds_in: input webdataset file
|
|
28
|
+
:type wds_in: string
|
|
29
|
+
:param wds_out: output webdataset file
|
|
30
|
+
:type wds_out: string
|
|
31
|
+
:param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
|
|
32
|
+
:type transforms: dict of functions
|
|
33
|
+
:param compress: whether to compress the output file
|
|
34
|
+
:type compress: boolean
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
ds = _SeqChromDatasetByWds(wds_in, transforms=transforms, keep_key=True)
|
|
38
|
+
sink = wds.TarWriter(wds_out, compress=compress)
|
|
39
|
+
for (key, seq, chrom, target, label) in ds:
|
|
40
|
+
feature_dict = defaultdict()
|
|
41
|
+
feature_dict["__key__"] = key
|
|
42
|
+
|
|
43
|
+
feature_dict["seq.npy"] = seq
|
|
44
|
+
feature_dict["chrom.npy"] = chrom
|
|
45
|
+
feature_dict["target.npy"] = target
|
|
46
|
+
feature_dict["label.npy"] = label
|
|
47
|
+
sink.write(feature_dict)
|
|
48
|
+
sink.close()
|
|
49
|
+
|
|
22
50
|
def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
|
|
23
51
|
target_bam=None,
|
|
24
52
|
outdir="dataset/", outprefix="seqchrom",
|
|
25
53
|
compress=True,
|
|
26
54
|
numProcessors=1,
|
|
27
|
-
transforms=None
|
|
55
|
+
transforms=None,
|
|
56
|
+
DALI=False):
|
|
28
57
|
"""
|
|
29
58
|
Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format
|
|
30
59
|
|
|
@@ -46,6 +75,8 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
|
|
|
46
75
|
:type compress: boolean
|
|
47
76
|
:param numProcessors: number of processors
|
|
48
77
|
:type numProcessors: int
|
|
78
|
+
:param DALI: set to True if the dataset will be consumed by NVIDIA DALI; every array is then saved as raw bytes, which discards the array shape information
|
|
79
|
+
:type DALI: boolean
|
|
49
80
|
"""
|
|
50
81
|
|
|
51
82
|
# split coordinates and assign chunks to workers
|
|
@@ -61,10 +92,16 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
|
|
|
61
92
|
target_bam=target_bam,
|
|
62
93
|
compress=compress,
|
|
63
94
|
outdir=outdir,
|
|
64
|
-
transforms=transforms
|
|
95
|
+
transforms=transforms,
|
|
96
|
+
DALI=DALI)
|
|
97
|
+
|
|
98
|
+
count_of_digits = 0
|
|
99
|
+
while num_chunks > 0:
|
|
100
|
+
num_chunks = int(num_chunks/10)
|
|
101
|
+
count_of_digits += 1
|
|
65
102
|
|
|
66
103
|
pool = Pool(numProcessors)
|
|
67
|
-
res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" +
|
|
104
|
+
res = pool.starmap_async(dump_data_worker_freeze, zip(chunks, [outprefix + "_" + format(i, f'0{count_of_digits}d') for i in range(num_chunks)]))
|
|
68
105
|
files = res.get()
|
|
69
106
|
|
|
70
107
|
return files
|
|
@@ -76,7 +113,8 @@ def dump_data_webdataset_worker(coords,
|
|
|
76
113
|
target_bam=None,
|
|
77
114
|
outdir="dataset/",
|
|
78
115
|
compress=True,
|
|
79
|
-
transforms=None
|
|
116
|
+
transforms=None,
|
|
117
|
+
DALI=False):
|
|
80
118
|
# get handlers
|
|
81
119
|
genome_pyfasta = pyfasta.Fasta(fasta)
|
|
82
120
|
bigwigs = [pyBigWig.open(bw) for bw in bigwig_files]
|
|
@@ -103,11 +141,17 @@ def dump_data_webdataset_worker(coords,
|
|
|
103
141
|
)
|
|
104
142
|
except utils.BigWigInaccessible as e:
|
|
105
143
|
continue
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
144
|
+
|
|
145
|
+
if not DALI:
|
|
146
|
+
feature_dict["seq.npy"] = feature['seq']
|
|
147
|
+
feature_dict["chrom.npy"] = feature['chrom']
|
|
148
|
+
feature_dict["target.npy"] = feature['target']
|
|
149
|
+
feature_dict["label.npy"] = feature['label']
|
|
150
|
+
else:
|
|
151
|
+
feature_dict["seq.npy"] = feature['seq'].tobytes()
|
|
152
|
+
feature_dict["chrom.npy"] = feature['chrom'].tobytes()
|
|
153
|
+
feature_dict["target.npy"] = feature['target'].tobytes()
|
|
154
|
+
feature_dict["label.npy"] = feature['label'].tobytes()
|
|
111
155
|
|
|
112
156
|
sink.write(feature_dict)
|
|
113
157
|
|
|
@@ -20,7 +20,7 @@ setup(
|
|
|
20
20
|
# eg: 1.0.0, 1.0.1, 3.0.2, 5.0-beta, etc.
|
|
21
21
|
# You CANNOT upload two versions of your package with the same version number
|
|
22
22
|
# This field is REQUIRED
|
|
23
|
-
version="0.
|
|
23
|
+
version="0.4.0",
|
|
24
24
|
|
|
25
25
|
# The packages that constitute your project.
|
|
26
26
|
# For my project, I have only one - "pydash".
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|