zipstrain 0.2.8__tar.gz → 0.2.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {zipstrain-0.2.8 → zipstrain-0.2.16}/PKG-INFO +1 -1
- {zipstrain-0.2.8 → zipstrain-0.2.16}/pyproject.toml +1 -1
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/cli.py +75 -15
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/profile.py +54 -6
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/task_manager.py +46 -34
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/utils.py +127 -78
- {zipstrain-0.2.8 → zipstrain-0.2.16}/README.md +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/__init__.py +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/compare.py +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/database.py +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.16}/src/zipstrain/visualize.py +0 -0
|
@@ -324,43 +324,91 @@ def to_complete_table(genome_comparison_object, output_file):
|
|
|
324
324
|
@click.option('--profile-file', '-p', required=True, help="Path to the profile Parquet file.")
|
|
325
325
|
@click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
|
|
326
326
|
@click.option('--bed-file', '-b', required=True, help="Path to the BED file.")
|
|
327
|
-
@click.option('--
|
|
328
|
-
@click.option('--
|
|
329
|
-
@click.option('--
|
|
327
|
+
@click.option('--read-loc-file', '-r', required=True, help="Path to the read location table.")
|
|
328
|
+
@click.option('--min-cov-fug', '-c', default=0.1, help="Minimum coverage to use fug.")
|
|
329
|
+
@click.option('--fug-threshold', '-f', default=2, help="FUG threshold.")
|
|
330
|
+
@click.option('--ber', '-e', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
|
|
330
331
|
@click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
|
|
331
|
-
def presence_profile(profile_file, stb_file, bed_file,
|
|
332
|
+
def presence_profile(profile_file, stb_file, bed_file, read_loc_file, min_cov_fug, fug_threshold, ber, output_file):
|
|
332
333
|
"""
|
|
333
|
-
Generate a presence profile
|
|
334
|
+
Generate a presence profile for genomes based on the given profile and read location data.
|
|
334
335
|
|
|
335
336
|
Args:
|
|
336
337
|
profile_file (str): Path to the profile Parquet file.
|
|
337
338
|
stb_file (str): Path to the scaffold-to-genome mapping file.
|
|
338
339
|
bed_file (str): Path to the BED file.
|
|
340
|
+
read_loc_file (str): Path to the read location table.
|
|
341
|
+
min_cov_fug (float): Minimum coverage to use fug.
|
|
342
|
+
fug_threshold (float): FUG threshold.
|
|
343
|
+
ber (float): Minimum ratio of breadth over expected breadth to consider presence.
|
|
344
|
+
output_file (str): Path to save the output Parquet file.
|
|
339
345
|
"""
|
|
340
|
-
profile=pl.scan_parquet(profile_file)
|
|
346
|
+
profile = pl.scan_parquet(profile_file)
|
|
341
347
|
stb = pl.scan_csv(stb_file, separator="\t", has_header=False).with_columns(
|
|
342
348
|
pl.col("column_1").alias("scaffold"),
|
|
343
349
|
pl.col("column_2").alias("genome")
|
|
344
350
|
).select(["scaffold", "genome"])
|
|
345
351
|
bed = pl.scan_csv(bed_file, separator="\t", has_header=False).with_columns(
|
|
346
352
|
pl.col("column_1").alias("scaffold"),
|
|
347
|
-
pl.col("column_2").alias("start"),
|
|
348
|
-
pl.col("column_3").alias("end")
|
|
353
|
+
pl.col("column_2").cast(pl.Int64).alias("start"),
|
|
354
|
+
pl.col("column_3").cast(pl.Int64).alias("end")
|
|
349
355
|
).select(["scaffold", "start", "end"])
|
|
350
|
-
|
|
356
|
+
read_loc_table = pl.scan_parquet(read_loc_file).rename({
|
|
357
|
+
"chrom":"scaffold",
|
|
358
|
+
"pos":"loc"
|
|
359
|
+
})
|
|
360
|
+
presence_df = ut.get_genome_stats(
|
|
351
361
|
profile=profile,
|
|
352
362
|
stb=stb,
|
|
353
363
|
bed=bed,
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
364
|
+
read_loc_table=read_loc_table,
|
|
365
|
+
min_cov_use_fug=min_cov_fug,
|
|
366
|
+
fug=fug_threshold,
|
|
367
|
+
ber=ber
|
|
368
|
+
)
|
|
369
|
+
presence_df.sink_parquet(output_file, compression='zstd')
|
|
370
|
+
|
|
371
|
+
@utilities.command("process-read-locs")
@click.option("--output-file", "-o", required=True, help="Path to save the processed read locations Parquet file.")
def process_read_locs(output_file):
    """
    Process read locations and save them to a Parquet file.

    Thin command-line wrapper: the destination is converted to a
    ``pathlib.Path`` and handed to ``ut.process_read_location``, which does
    the actual work.

    Args:
        output_file (str): Path to save the output Parquet file.
    """
    destination = pathlib.Path(output_file)
    ut.process_read_location(output_file=destination)
|
|
358
381
|
|
|
359
382
|
@cli.group()
|
|
360
383
|
def gene_tools():
|
|
361
384
|
"""Holds anything related to gene analysis."""
|
|
362
385
|
pass
|
|
363
386
|
|
|
387
|
+
@utilities.command("generate_stb")
@click.option('--genomes-dir-file', '-g', required=True, help="Path to a directory of genome fasta files, or to a text file with one genome fasta file path per line.")
@click.option('--output-file', '-o', required=True, help="Path to save the output scaffold-to-genome mapping file.")
@click.option('--extension', '-e', default=".fasta", help="File extension of the genome fasta files.")
def generate_stb(genomes_dir_file, output_file, extension):
    """
    Generate a scaffold-to-genome mapping file from a set of genome fasta files.

    Every fasta header line (starting with ``>``) contributes one
    tab-separated ``scaffold<TAB>genome`` row, where the scaffold is the first
    whitespace-delimited token of the header and the genome is the fasta file
    name without its final suffix.

    Args:
        genomes_dir_file (str): Either a directory, in which case every file
            matching ``*<extension>`` directly inside it is used, or a text
            file listing one genome fasta path per line (blank lines are
            ignored; relative paths are resolved against the current working
            directory).
        output_file (str): Path to save the output scaffold-to-genome mapping file.
        extension (str): File extension of the genome fasta files. Only used
            to select files when ``genomes_dir_file`` is a directory.

    Raises:
        FileNotFoundError: If ``genomes_dir_file`` or a listed fasta file does
            not exist.
    """
    source = pathlib.Path(genomes_dir_file)
    if source.is_dir():
        # Sorted so the mapping file is reproducible across filesystems.
        genome_paths = sorted(source.glob(f"*{extension}"))
    else:
        # Documented usage: a plain text file with one fasta path per line.
        # Globbing a regular file yields nothing, which used to produce an
        # empty mapping without any error.
        with open(source, 'r') as list_f:
            genome_paths = [
                pathlib.Path(entry.strip()) for entry in list_f if entry.strip()
            ]

    with open(output_file, 'w') as out_f:
        for genome in genome_paths:
            genome_name = genome.stem
            with open(genome, 'r') as gf:
                for line in gf:
                    if not line.startswith('>'):
                        continue
                    header_tokens = line[1:].split()
                    if not header_tokens:
                        # A bare ">" carries no scaffold name; skip it rather
                        # than fail with an IndexError.
                        continue
                    out_f.write(f"{header_tokens[0]}\t{genome_name}\n")
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
|
|
364
412
|
|
|
365
413
|
@gene_tools.command("gene-range-table")
|
|
366
414
|
@click.option('--gene-file', '-g', required=True, help="location of gene file. Prodigal's nucleotide fasta output")
|
|
@@ -603,22 +651,34 @@ def prepare_profiling(reference_fasta, gene_fasta, stb_file, output_dir):
|
|
|
603
651
|
@profile.command("profile-single")
|
|
604
652
|
@click.option('--bed-file', '-b', required=True, help="Path to the BED file describing regions to be profiled.")
|
|
605
653
|
@click.option('--bam-file', '-a', required=True, help="Path to the BAM file to be profiled.")
|
|
654
|
+
@click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
|
|
606
655
|
@click.option('--gene-range-table', '-g', required=True, help="Path to the gene range table.")
|
|
607
656
|
@click.option('--num-workers', '-n', default=1, help="Number of workers to use for profiling.")
|
|
608
657
|
@click.option('--output-dir', '-o', required=True, help="Directory to save the profiling output.")
|
|
609
|
-
|
|
658
|
+
@click.option('--ber', '-r', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
|
|
659
|
+
@click.option('--fug', '-f', default=2.0, help="fraction of expected gaps (FUG) threshold.")
|
|
660
|
+
@click.option('--min-cov-use-fug', '-m', default=0.1, help="Minimum coverage to use FUG.")
|
|
661
|
+
def profile_single(bed_file, bam_file, stb_file, gene_range_table, num_workers, output_dir, ber, fug, min_cov_use_fug):
|
|
610
662
|
"""
|
|
611
663
|
Profile a single BAM file using the provided BED file and gene range table.
|
|
612
664
|
|
|
613
665
|
"""
|
|
614
666
|
output_dir=pathlib.Path(output_dir)
|
|
615
667
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
668
|
+
stb= pl.scan_csv(stb_file, separator='\t',has_header=False).with_columns(
|
|
669
|
+
pl.col("column_1").alias("scaffold"),
|
|
670
|
+
pl.col("column_2").alias("genome")
|
|
671
|
+
)
|
|
616
672
|
pf.profile_bam(
|
|
617
673
|
bed_file=bed_file,
|
|
618
674
|
bam_file=bam_file,
|
|
619
675
|
gene_range_table=gene_range_table,
|
|
676
|
+
stb=stb,
|
|
620
677
|
output_dir=output_dir,
|
|
621
|
-
num_workers=num_workers
|
|
678
|
+
num_workers=num_workers,
|
|
679
|
+
ber=ber,
|
|
680
|
+
fug=fug,
|
|
681
|
+
min_cov_use_fug=min_cov_use_fug
|
|
622
682
|
)
|
|
623
683
|
|
|
624
684
|
@cli.group()
|
|
@@ -147,13 +147,27 @@ async def _profile_chunk_task(
|
|
|
147
147
|
stdout, stderr = await proc.communicate()
|
|
148
148
|
if proc.returncode != 0:
|
|
149
149
|
raise Exception(f"Command failed with error: {stderr.decode().strip()}")
|
|
150
|
+
cmd=["samtools", "view", "-F", "132", "-L", str(bed_file.absolute()), str(bam_file.absolute()), "|", "zipstrain", "utilities", "process-read-locs", "--output-file", f"{bam_file.stem}_read_locs_{chunk_id}.parquet"]
|
|
151
|
+
proc = await asyncio.create_subprocess_shell(
|
|
152
|
+
" ".join(cmd),
|
|
153
|
+
stdout=asyncio.subprocess.PIPE,
|
|
154
|
+
stderr=asyncio.subprocess.PIPE,
|
|
155
|
+
cwd=output_dir
|
|
156
|
+
)
|
|
157
|
+
stdout, stderr = await proc.communicate()
|
|
158
|
+
if proc.returncode != 0:
|
|
159
|
+
raise Exception(f"Command failed with error: {stderr.decode().strip()}")
|
|
150
160
|
|
|
151
161
|
async def profile_bam_in_chunks(
|
|
152
162
|
bed_file:str,
|
|
153
163
|
bam_file:str,
|
|
154
164
|
gene_range_table:str,
|
|
165
|
+
stb:pl.LazyFrame,
|
|
155
166
|
output_dir:str,
|
|
156
|
-
num_workers:int=4
|
|
167
|
+
num_workers:int=4,
|
|
168
|
+
ber:float=0.5,
|
|
169
|
+
fug:float=2.0,
|
|
170
|
+
min_cov_use_fug:int=0.1
|
|
157
171
|
)->None:
|
|
158
172
|
"""
|
|
159
173
|
Profile a BAM file in chunks using provided BED files.
|
|
@@ -189,17 +203,46 @@ async def profile_bam_in_chunks(
|
|
|
189
203
|
chunk_id=chunk_id
|
|
190
204
|
))
|
|
191
205
|
await asyncio.gather(*tasks)
|
|
192
|
-
pfs=[output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet" for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
|
|
193
|
-
|
|
194
|
-
|
|
206
|
+
pfs=[(output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet", output_dir/"tmp"/f"{bam_file.stem}_read_locs_{chunk_id}.parquet" ) for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
|
|
207
|
+
|
|
208
|
+
mpile_container: list[pl.LazyFrame] = []
|
|
209
|
+
read_loc_pfs: list[pl.LazyFrame] = []
|
|
210
|
+
for pf, read_loc_pf in pfs:
|
|
211
|
+
mpile_container.append(pl.scan_parquet(pf).lazy())
|
|
212
|
+
read_loc_pfs.append(pl.scan_parquet(read_loc_pf).lazy())
|
|
213
|
+
|
|
214
|
+
mpileup_df = pl.concat(mpile_container)
|
|
215
|
+
mpileup_df.sink_parquet(output_dir/f"{bam_file.stem}_profile.parquet", compression='zstd', engine='streaming')
|
|
216
|
+
read_loc_df = pl.concat(read_loc_pfs).rename(
|
|
217
|
+
{
|
|
218
|
+
"chrom":"scaffold",
|
|
219
|
+
"pos":"loc",
|
|
220
|
+
}
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
utils.get_genome_stats(
|
|
224
|
+
profile=mpileup_df,
|
|
225
|
+
read_loc_table=read_loc_df,
|
|
226
|
+
stb=stb,
|
|
227
|
+
bed=bed_lf.rename({"column_1":"scaffold","column_2":"start","column_3":"end"}),
|
|
228
|
+
ber=ber,
|
|
229
|
+
fug=fug,
|
|
230
|
+
min_cov_use_fug=min_cov_use_fug,
|
|
231
|
+
).sink_parquet(output_dir/f"{bam_file.stem}_genome_stats.parquet", compression='zstd', engine='streaming')
|
|
232
|
+
|
|
233
|
+
|
|
195
234
|
os.system(f"rm -r {output_dir}/tmp")
|
|
196
235
|
|
|
197
236
|
def profile_bam(
|
|
198
237
|
bed_file:str,
|
|
199
238
|
bam_file:str,
|
|
200
239
|
gene_range_table:str,
|
|
240
|
+
stb:pl.LazyFrame,
|
|
201
241
|
output_dir:str,
|
|
202
|
-
num_workers:int=4
|
|
242
|
+
num_workers:int=4,
|
|
243
|
+
ber:float=0.5,
|
|
244
|
+
fug:float=2.0,
|
|
245
|
+
min_cov_use_fug:int=0.1
|
|
203
246
|
)->None:
|
|
204
247
|
"""
|
|
205
248
|
Profile a BAM file in chunks using provided BED files.
|
|
@@ -208,6 +251,7 @@ def profile_bam(
|
|
|
208
251
|
bed_file (list[pathlib.Path]): A bed file describing all regions to be profiled.
|
|
209
252
|
bam_file (pathlib.Path): Path to the BAM file.
|
|
210
253
|
gene_range_table (pathlib.Path): Path to the gene range table.
|
|
254
|
+
stb (pl.LazyFrame): Scaffold-to-bin mapping table.
|
|
211
255
|
output_dir (pathlib.Path): Directory to save output files.
|
|
212
256
|
num_workers (int): Number of concurrent workers to use.
|
|
213
257
|
"""
|
|
@@ -215,7 +259,11 @@ def profile_bam(
|
|
|
215
259
|
bed_file=bed_file,
|
|
216
260
|
bam_file=bam_file,
|
|
217
261
|
gene_range_table=gene_range_table,
|
|
262
|
+
stb=stb,
|
|
218
263
|
output_dir=output_dir,
|
|
219
|
-
num_workers=num_workers
|
|
264
|
+
num_workers=num_workers,
|
|
265
|
+
ber=ber,
|
|
266
|
+
fug=fug,
|
|
267
|
+
min_cov_use_fug=min_cov_use_fug
|
|
220
268
|
))
|
|
221
269
|
|
|
@@ -127,6 +127,9 @@ class Status(StrEnum):
|
|
|
127
127
|
SUCCESS = "success"
|
|
128
128
|
PENDING = "pending"
|
|
129
129
|
|
|
130
|
+
class Messages(StrEnum):
    """Enumeration of common messages used in task and batch management."""
    # Text describing a task that was cancelled by a signal from the user.
    CANCELLED_BY_USER = "Task was cancelled by a signal from the user."
|
|
130
133
|
|
|
131
134
|
class Input(ABC):
|
|
132
135
|
"""Abstract base class for task inputs. DO NOT INSTANTIATE DIRECTLY.
|
|
@@ -275,7 +278,6 @@ class IntOutput(Output):
|
|
|
275
278
|
raise ValueError(f"Output value for task {self.task.id} is not an integer.")
|
|
276
279
|
else:
|
|
277
280
|
return False
|
|
278
|
-
return False
|
|
279
281
|
|
|
280
282
|
|
|
281
283
|
class Engine(ABC):
|
|
@@ -391,6 +393,7 @@ class Task(ABC):
|
|
|
391
393
|
"""Asynchronously reads the task status from the .status file in the task directory."""
|
|
392
394
|
status_path = self.task_dir / ".status"
|
|
393
395
|
# read the status file if it exists
|
|
396
|
+
|
|
394
397
|
if status_path.exists():
|
|
395
398
|
raw = await read_file(status_path, self.file_semaphore)
|
|
396
399
|
self._status = raw.strip()
|
|
@@ -406,12 +409,11 @@ class Task(ABC):
|
|
|
406
409
|
except Exception:
|
|
407
410
|
all_ready = False
|
|
408
411
|
|
|
409
|
-
if all_ready:
|
|
412
|
+
if all_ready or self._batch_obj._cleaned_up:
|
|
410
413
|
self._status = Status.SUCCESS.value
|
|
411
|
-
|
|
414
|
+
|
|
412
415
|
else:
|
|
413
416
|
self._status = Status.FAILED.value
|
|
414
|
-
await write_file(status_path, Status.FAILED.value, self.file_semaphore)
|
|
415
417
|
raise ValueError(f"Task {self.id} reported done but outputs are not ready or invalid. {self.expected_outputs['output-file'].expected_file.absolute()}")
|
|
416
418
|
|
|
417
419
|
return self._status
|
|
@@ -521,13 +523,13 @@ class ProfileTaskGenerator(TaskGenerator):
|
|
|
521
523
|
"bed-file": FileInput(self.profile_bed_file),
|
|
522
524
|
"gene-range-table": FileInput(self.gene_range_file),
|
|
523
525
|
"genome-length-file": FileInput(self.genome_length_file),
|
|
524
|
-
"num-
|
|
526
|
+
"num-workers": IntInput(self.num_procs),
|
|
525
527
|
"breadth-min-cov": IntInput(self.breadth_min_cov),
|
|
526
528
|
}
|
|
527
529
|
expected_outputs ={
|
|
528
530
|
"profile": FileOutput(row["sample_name"]+".parquet" ),
|
|
529
|
-
"breadth": FileOutput(row["sample_name"]+"_breadth.parquet" ),
|
|
530
531
|
"scaffold": FileOutput(row["sample_name"]+".parquet.scaffolds" ),
|
|
532
|
+
"genome-stats": FileOutput(row["sample_name"]+"_genome_stats.parquet" ),
|
|
531
533
|
}
|
|
532
534
|
task = ProfileBamTask(id=row["sample_name"], inputs=inputs, expected_outputs=expected_outputs, engine=self.engine)
|
|
533
535
|
tasks.append(task)
|
|
@@ -637,9 +639,8 @@ class Batch(ABC):
|
|
|
637
639
|
task.map_io()
|
|
638
640
|
|
|
639
641
|
self._runner_obj:Runner = None
|
|
640
|
-
|
|
642
|
+
self._cleaned_up = False
|
|
641
643
|
|
|
642
|
-
|
|
643
644
|
def _get_initial_status(self) -> str:
|
|
644
645
|
"""Returns the initial status of the batch based on the presence of the batch directory."""
|
|
645
646
|
if not self.batch_dir.exists():
|
|
@@ -663,6 +664,7 @@ class Batch(ABC):
|
|
|
663
664
|
|
|
664
665
|
def cleanup(self) -> None:
|
|
665
666
|
"""The base class defines if any cleanup is needed after batch success. By default, it does nothing."""
|
|
667
|
+
self._cleaned_up = True
|
|
666
668
|
return None
|
|
667
669
|
|
|
668
670
|
@abstractmethod
|
|
@@ -718,33 +720,30 @@ class LocalBatch(Batch):
|
|
|
718
720
|
|
|
719
721
|
def __init__(self, tasks, id, run_dir, expected_outputs) -> None:
|
|
720
722
|
super().__init__(tasks, id, run_dir, expected_outputs)
|
|
721
|
-
self._script = self.TEMPLATE_CMD + "\nset -
|
|
723
|
+
self._script = self.TEMPLATE_CMD + "\nset -euo pipefail\n"
|
|
722
724
|
self._proc: asyncio.subprocess.Process | None = None
|
|
723
725
|
|
|
724
726
|
|
|
725
727
|
async def run(self) -> None:
|
|
726
728
|
"""This method runs all tasks in the batch locally by creating a shell script and executing it."""
|
|
727
|
-
if self.status != Status.SUCCESS
|
|
729
|
+
if self.status != Status.SUCCESS:
|
|
728
730
|
self.batch_dir.mkdir(parents=True, exist_ok=True)
|
|
731
|
+
|
|
729
732
|
self._status = Status.RUNNING.value
|
|
730
|
-
|
|
731
|
-
|
|
732
733
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
733
|
-
|
|
734
|
+
|
|
735
|
+
script_path = self.batch_dir / f"{self.id}.sh" # Path to the shell script for the batch
|
|
736
|
+
script = self._script # Initialize the script content
|
|
737
|
+
|
|
734
738
|
for task in self.tasks:
|
|
735
|
-
if task.status
|
|
739
|
+
if task.status != Status.SUCCESS.value:
|
|
736
740
|
task.task_dir.mkdir(parents=True, exist_ok=True) # Create task directory
|
|
737
741
|
await write_file(task.task_dir / ".status", Status.NOT_STARTED.value, self.file_semaphore)
|
|
738
|
-
|
|
739
|
-
script_path = self.batch_dir / f"{self.id}.sh" # Path to the shell script for the batch
|
|
740
|
-
|
|
741
|
-
script = self._script
|
|
742
|
-
for task in self.tasks:
|
|
743
|
-
if task.status == Status.NOT_STARTED.value or task.status == Status.FAILED.value:
|
|
744
742
|
script += f"\n{task.pre_run}\n{task.command}\n{task.post_run}\n"
|
|
745
743
|
|
|
744
|
+
|
|
745
|
+
|
|
746
746
|
await write_file(script_path, script, self.file_semaphore)
|
|
747
|
-
|
|
748
747
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
749
748
|
|
|
750
749
|
self._proc = await asyncio.create_subprocess_exec(
|
|
@@ -753,28 +752,37 @@ class LocalBatch(Batch):
|
|
|
753
752
|
stderr=asyncio.subprocess.PIPE,
|
|
754
753
|
cwd=self.batch_dir,
|
|
755
754
|
)
|
|
755
|
+
|
|
756
756
|
try:
|
|
757
757
|
out_bytes, err_bytes = await self._proc.communicate()
|
|
758
|
+
|
|
758
759
|
except asyncio.CancelledError:
|
|
759
760
|
if self._proc and self._proc.returncode is None:
|
|
760
761
|
self._proc.terminate()
|
|
761
|
-
|
|
762
|
+
await write_file(self.batch_dir / f"{self.id}.err", err_bytes.decode(), self.file_semaphore)
|
|
763
|
+
|
|
764
|
+
raise Exception
|
|
765
|
+
|
|
762
766
|
|
|
763
767
|
await write_file(self.batch_dir / f"{self.id}.out", out_bytes.decode(), self.file_semaphore)
|
|
764
768
|
await write_file(self.batch_dir / f"{self.id}.err", err_bytes.decode(), self.file_semaphore)
|
|
765
769
|
|
|
770
|
+
if self._proc.returncode != 0:
|
|
771
|
+
error=err_bytes.decode()
|
|
772
|
+
raise RuntimeError(f"Batch script failed with error:\n{error}")
|
|
773
|
+
|
|
766
774
|
if self._proc.returncode == 0 and self.outputs_ready():
|
|
767
775
|
self.cleanup()
|
|
768
776
|
self._status = Status.SUCCESS.value
|
|
769
777
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
778
|
+
|
|
770
779
|
else:
|
|
771
780
|
self._status = Status.FAILED.value
|
|
772
781
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
773
|
-
|
|
774
|
-
elif self.status == Status.SUCCESS.value and self.outputs_ready():
|
|
775
|
-
self._status = Status.SUCCESS.value
|
|
782
|
+
|
|
776
783
|
else:
|
|
777
|
-
self._status = Status.
|
|
784
|
+
self._status = Status.SUCCESS.value
|
|
785
|
+
|
|
778
786
|
|
|
779
787
|
def _parse_job_id(self, sbatch_output):
|
|
780
788
|
return super()._parse_job_id(sbatch_output)
|
|
@@ -810,7 +818,7 @@ class SlurmBatch(Batch):
|
|
|
810
818
|
super().__init__(tasks, id, run_dir, expected_outputs)
|
|
811
819
|
self._check_slurm_works()
|
|
812
820
|
self.slurm_config = slurm_config
|
|
813
|
-
self._script = self.TEMPLATE_CMD + self.slurm_config.to_slurm_args() + "\nset -
|
|
821
|
+
self._script = self.TEMPLATE_CMD + self.slurm_config.to_slurm_args() + "\nset -euo pipefail\n"
|
|
814
822
|
self._job_id = None
|
|
815
823
|
|
|
816
824
|
def _check_slurm_works(self) -> None:
|
|
@@ -1450,7 +1458,7 @@ class ProfileBamTask(Task):
|
|
|
1450
1458
|
|
|
1451
1459
|
- gene-range-table: A BED file specifying the gene ranges for the sample.
|
|
1452
1460
|
|
|
1453
|
-
- num-
|
|
1461
|
+
- num-workers: The number of concurrent workers to use for processing.
|
|
1454
1462
|
|
|
1455
1463
|
- genome-length-file: A file containing the lengths of the genomes in the reference fasta.
|
|
1456
1464
|
|
|
@@ -1470,15 +1478,12 @@ class ProfileBamTask(Task):
|
|
|
1470
1478
|
zipstrain profile profile-single --bam-file input.bam \
|
|
1471
1479
|
--bed-file bed_file.bed \
|
|
1472
1480
|
--gene-range-table gene-range-table.bed \
|
|
1481
|
+
--stb-file <stb-file> \
|
|
1473
1482
|
--num-workers <num-workers> \
|
|
1474
1483
|
--output-dir .
|
|
1475
|
-
mv
|
|
1484
|
+
mv input_profile.parquet <sample-name>.parquet
|
|
1485
|
+
mv input_genome_stats.parquet <sample-name>_genome_stats.parquet
|
|
1476
1486
|
samtools idxstats <bam-file> | awk '$3 > 0 {print $1}' > <sample-name>.parquet.scaffolds
|
|
1477
|
-
zipstrain utilities genome_breadth_matrix --profile <sample-name>.parquet \
|
|
1478
|
-
--genome-length <genome-length-file> \
|
|
1479
|
-
--stb <stb-file> \
|
|
1480
|
-
--min-cov <breadth-min-cov> \
|
|
1481
|
-
--output-file <sample-name>_breadth.parquet
|
|
1482
1487
|
"""
|
|
1483
1488
|
|
|
1484
1489
|
class FastCompareTask(Task):
|
|
@@ -1552,6 +1557,7 @@ class FastCompareLocalBatch(LocalBatch):
|
|
|
1552
1557
|
for task in tasks_to_remove:
|
|
1553
1558
|
self.tasks.remove(task)
|
|
1554
1559
|
shutil.rmtree(task.task_dir)
|
|
1560
|
+
self._cleaned_up = True
|
|
1555
1561
|
|
|
1556
1562
|
class FastCompareSlurmBatch(SlurmBatch):
|
|
1557
1563
|
"""A SlurmBatch that runs FastCompareTask tasks on a Slurm cluster. Maybe removed in future"""
|
|
@@ -1560,6 +1566,8 @@ class FastCompareSlurmBatch(SlurmBatch):
|
|
|
1560
1566
|
for task in tasks_to_remove:
|
|
1561
1567
|
self.tasks.remove(task)
|
|
1562
1568
|
shutil.rmtree(task.task_dir)
|
|
1569
|
+
|
|
1570
|
+
self._cleaned_up = True
|
|
1563
1571
|
|
|
1564
1572
|
class PrepareCompareGenomeRunOutputsLocalBatch(LocalBatch):
|
|
1565
1573
|
pass
|
|
@@ -1925,8 +1933,11 @@ class FastGeneCompareLocalBatch(LocalBatch):
|
|
|
1925
1933
|
def cleanup(self) -> None:
|
|
1926
1934
|
tasks_to_remove = [task for task in self.tasks if isinstance(task, FastGeneCompareTask)]
|
|
1927
1935
|
for task in tasks_to_remove:
|
|
1936
|
+
task._status=Status.SUCCESS
|
|
1928
1937
|
self.tasks.remove(task)
|
|
1929
1938
|
shutil.rmtree(task.task_dir)
|
|
1939
|
+
self._cleaned_up = True
|
|
1940
|
+
|
|
1930
1941
|
|
|
1931
1942
|
class FastGeneCompareSlurmBatch(SlurmBatch):
|
|
1932
1943
|
"""A SlurmBatch that runs FastGeneCompareTask tasks on a Slurm cluster."""
|
|
@@ -1935,6 +1946,7 @@ class FastGeneCompareSlurmBatch(SlurmBatch):
|
|
|
1935
1946
|
for task in tasks_to_remove:
|
|
1936
1947
|
self.tasks.remove(task)
|
|
1937
1948
|
shutil.rmtree(task.task_dir)
|
|
1949
|
+
self._cleaned_up = True
|
|
1938
1950
|
|
|
1939
1951
|
class PrepareGeneCompareRunOutputsLocalBatch(LocalBatch):
|
|
1940
1952
|
pass
|
|
@@ -140,7 +140,7 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output
|
|
|
140
140
|
|
|
141
141
|
if writer is None:
|
|
142
142
|
# Open writer for the first time
|
|
143
|
-
writer = pq.ParquetWriter(output_file, schema, compression='
|
|
143
|
+
writer = pq.ParquetWriter(output_file, schema, compression='zstd')
|
|
144
144
|
writer.write_table(pa.Table.from_batches([batch]))
|
|
145
145
|
|
|
146
146
|
# Clear buffers
|
|
@@ -180,6 +180,51 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output
|
|
|
180
180
|
if writer:
|
|
181
181
|
writer.close()
|
|
182
182
|
|
|
183
|
+
def process_read_location(output_file:str, batch_size:int=10000)->None:
    """
    Read ``samtools view -F 132`` output from standard input and store the
    read locations in a Parquet file.

    For every alignment line the reference name (third tab-separated field)
    and the position (fourth field) are kept and written as the columns
    ``chrom`` (string) and ``pos`` (int32), compressed with zstd. Blank lines,
    SAM header lines (starting with ``@``) and lines with fewer than four
    fields are skipped.

    The output file is always created, even when no read is seen, so that
    code scanning the file afterwards finds a valid, empty table with the
    expected schema instead of a missing file.

    Args:
        output_file (str): Destination of the Parquet file (a path-like
            object such as ``pathlib.Path`` is passed by the CLI wrapper).
        batch_size (int): Number of reads buffered in memory before they are
            written out as one batch.

    Raises:
        ValueError: If the position field of an alignment line is not an integer.
    """
    schema = pa.schema([
        ('chrom', pa.string()),
        ('pos', pa.int32()),
    ])
    chroms = []
    positions = []

    # The context manager closes the writer on every exit path, including a
    # parsing error half-way through the stream.
    with pq.ParquetWriter(output_file, schema, compression='zstd') as writer:

        def flush_batch():
            # Write the buffered reads as one batch and reset the buffers.
            if not chroms:
                return
            batch = pa.RecordBatch.from_arrays([
                pa.array(chroms, type=pa.string()),
                pa.array(positions, type=pa.int32()),
            ], schema=schema)
            writer.write_table(pa.Table.from_batches([batch]))
            chroms.clear()
            positions.clear()

        for line in sys.stdin:
            # Read names cannot start with "@" in SAM, so this only drops
            # header lines (present when samtools is run with -h).
            if not line.strip() or line.startswith('@'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 4:
                continue
            chroms.append(fields[2])
            positions.append(int(fields[3]))
            if len(chroms) >= batch_size:
                flush_batch()

        # Flush remaining data
        flush_batch()
|
|
226
|
+
|
|
227
|
+
|
|
183
228
|
def extract_genome_length(stb: pl.LazyFrame, bed_table: pl.LazyFrame) -> pl.LazyFrame:
|
|
184
229
|
"""
|
|
185
230
|
Extract the genome length information from the scaffold-to-genome mapping table.
|
|
@@ -350,102 +395,106 @@ def split_lf_to_chunks(lf:pl.LazyFrame,num_chunks:int)->list[pl.LazyFrame]:
|
|
|
350
395
|
return chunks
|
|
351
396
|
|
|
352
397
|
|
|
353
|
-
def
|
|
398
|
+
def get_genome_gaps(
    read_loc_table: pl.LazyFrame,
    stb: pl.LazyFrame,
    genome_length: pl.LazyFrame,
)-> pl.LazyFrame:
    """
    Compute a per-genome ``fug`` statistic from the gaps between read locations.

    The computation proceeds as follows:

    1. Reads are sorted by ``scaffold`` and ``loc``; ``gap_length`` is the
       difference between a read's ``loc`` and the previous ``loc`` on the
       same scaffold.
    2. Scaffolds are mapped to genomes through ``stb`` (left join).
    3. Per genome, ``rn`` is the number of reads and ``delta`` is
       ``genome_length / rn`` rounded.
    4. Only gaps strictly larger than ``delta`` are kept. For every distinct
       gap length of a genome, its count divided by ``rn - 1`` is multiplied
       by ``gap_length - delta``.
    5. ``fug`` is ``(delta - sum of those terms) / delta``.

    Args:
        read_loc_table (pl.LazyFrame): Read locations with the columns
            ``scaffold`` and ``loc``.
        stb (pl.LazyFrame): Mapping with the columns ``scaffold`` and ``genome``.
        genome_length (pl.LazyFrame): Table with the columns ``genome`` and
            ``genome_length``.

    Returns:
        pl.LazyFrame: One row per genome with the columns ``genome`` and
        ``fug``. A genome without any gap larger than its ``delta`` has no row.
    """
    previous_loc = pl.col("loc").shift(1).over("scaffold")
    located = (
        read_loc_table
        .sort(["scaffold", 'loc'])
        .with_columns((pl.col("loc") - previous_loc).alias("gap_length"))
        .join(stb, on="scaffold", how="left")
    )

    # Read count and expected spacing between reads, per genome.
    per_genome = (
        located
        .group_by("genome")
        .agg(pl.len().alias("rn"))
        .join(genome_length, on="genome", how="left")
        .with_columns(
            (pl.col("genome_length") / pl.col("rn")).round().alias("delta")
        )
        .select(pl.col("genome"), pl.col("delta"), pl.col("rn"))
    )

    # Keep only the gaps that exceed the expected spacing.
    oversized = (
        located
        .join(per_genome, on="genome", how="left")
        .filter(pl.col("gap_length") > pl.col("delta"))
    )

    # Share of each distinct gap length, weighted by its excess over delta.
    weighted = (
        oversized
        .group_by(["genome", "gap_length"])
        .agg(
            (pl.len() / (pl.col("rn").first() - 1)).alias("pd"),
            pl.col("delta").first().alias("delta"),
        )
        .with_columns(
            (pl.col("pd") * (pl.col("gap_length") - pl.col("delta"))).alias("pd")
        )
    )

    first_delta = pl.col("delta").first()
    fug_per_genome = weighted.group_by("genome").agg(
        ((first_delta - pl.col("pd").sum()) / first_delta).alias("fug")
    )
    return fug_per_genome.select(pl.col("genome"), pl.col("fug"))
|
|
441
|
+
|
|
442
|
+
def get_genome_stats(
|
|
354
443
|
profile:pl.LazyFrame,
|
|
355
444
|
bed: pl.LazyFrame,
|
|
356
445
|
stb: pl.LazyFrame,
|
|
446
|
+
read_loc_table: pl.LazyFrame,
|
|
357
447
|
ber:float=0.5,
|
|
358
|
-
|
|
359
|
-
|
|
448
|
+
fug:float=2,
|
|
449
|
+
min_cov_use_fug:int=0.1
|
|
360
450
|
)->pl.LazyFrame:
|
|
361
|
-
"""
|
|
362
|
-
This function estimates the presence of genomes in a sample based on coverage information.
|
|
363
|
-
as long as the coverage is above a certain threshold. BER is used to decide the threshold.
|
|
364
|
-
However, if the coverage is below the threshold, the coefficient of variation (CV) is used instead as
|
|
365
|
-
a more reliable metric for low-coverage scenarios.
|
|
366
|
-
|
|
367
|
-
Args:
|
|
368
|
-
profile (pl.LazyFrame): The profile LazyFrame containing coverage information.
|
|
369
|
-
bed (pl.LazyFrame): The BED table containing genomic regions.
|
|
370
|
-
stb (pl.LazyFrame): The scaffold-to-bin mapping LazyFrame.
|
|
371
|
-
ber (float): Breadth over expected breadth ratio threshold for genome presence.
|
|
372
|
-
cv_threshold (float): Coefficient of variation threshold for genome presence.
|
|
373
|
-
min_cov_constant_poisson (int): Minimum coverage threshold to use BER for presence estimation.
|
|
374
|
-
|
|
375
|
-
Returns:
|
|
376
|
-
pl.LazyFrame: A LazyFrame containing genome presence information.
|
|
377
|
-
"""
|
|
378
|
-
profile=profile.with_columns(
|
|
379
|
-
(pl.col("A")+pl.col("T")+pl.col("C")+pl.col("G")).alias("coverage")
|
|
380
|
-
)
|
|
381
|
-
starts_df=bed.select(
|
|
382
|
-
pl.col("scaffold").cast(profile.collect_schema()["chrom"]).alias("chrom"),
|
|
383
|
-
pl.col("start").cast(profile.collect_schema()["pos"]).alias("pos"),
|
|
384
|
-
pl.lit("NA").cast(profile.collect_schema()["gene"]).alias("gene"),
|
|
385
|
-
pl.lit(0).cast(profile.collect_schema()["A"]).alias("A"),
|
|
386
|
-
pl.lit(0).cast(profile.collect_schema()["T"]).alias("T"),
|
|
387
|
-
pl.lit(0).cast(profile.collect_schema()["C"]).alias("C"),
|
|
388
|
-
pl.lit(0).cast(profile.collect_schema()["G"]).alias("G"),
|
|
389
|
-
pl.lit(0).cast(profile.collect_schema()["coverage"]).alias("coverage")
|
|
390
|
-
)
|
|
391
|
-
ends_df=bed.select(
|
|
392
|
-
pl.col("scaffold").cast(profile.collect_schema()["chrom"]).alias("chrom"),
|
|
393
|
-
(pl.col("end")-1).cast(profile.collect_schema()["pos"]).alias("pos"),
|
|
394
|
-
pl.lit("NA").cast(profile.collect_schema()["gene"]).alias("gene"),
|
|
395
|
-
pl.lit(0).cast(profile.collect_schema()["A"]).alias("A"),
|
|
396
|
-
pl.lit(0).cast(profile.collect_schema()["T"]).alias("T"),
|
|
397
|
-
pl.lit(0).cast(profile.collect_schema()["C"]).alias("C"),
|
|
398
|
-
pl.lit(0).cast(profile.collect_schema()["G"]).alias("G"),
|
|
399
|
-
pl.lit(0).cast(profile.collect_schema()["coverage"]).alias("coverage")
|
|
400
|
-
)
|
|
401
451
|
|
|
402
|
-
|
|
403
|
-
|
|
452
|
+
genome_lengths=extract_genome_length(stb, bed)
|
|
453
|
+
genome_gap_stats= get_genome_gaps(read_loc_table, stb, genome_lengths)
|
|
454
|
+
profile=profile.join(
|
|
404
455
|
stb,
|
|
405
|
-
|
|
456
|
+
left_on="chrom",
|
|
457
|
+
right_on="scaffold",
|
|
406
458
|
how="left"
|
|
407
|
-
).group_by("genome").agg(
|
|
408
|
-
genome_length=(pl.col("end") - pl.col("start")).sum()
|
|
409
459
|
).select(
|
|
460
|
+
pl.col("chrom"),
|
|
410
461
|
pl.col("genome"),
|
|
411
|
-
pl.col("
|
|
462
|
+
(pl.col("A")+pl.col("C")+pl.col("G")+pl.col("T")).alias("coverage")
|
|
412
463
|
)
|
|
413
|
-
profile=profile.
|
|
414
|
-
pl.
|
|
415
|
-
|
|
416
|
-
(pl.col("pos") - pl.col("prev_pos")).clip(lower_bound=1).alias("gap_size")
|
|
464
|
+
profile=profile.group_by("genome").agg(
|
|
465
|
+
total_covered_sites=pl.len(),
|
|
466
|
+
coverage=pl.col("coverage").sum()
|
|
417
467
|
).join(
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
right_on="scaffold",
|
|
468
|
+
genome_lengths,
|
|
469
|
+
on="genome",
|
|
421
470
|
how="left"
|
|
422
|
-
).group_by("genome").agg(
|
|
423
|
-
cv=pl.col("gap_size").filter(pl.col("gap_size") > 1).std()/pl.col("gap_size").filter(pl.col("gap_size") > 1).mean(),
|
|
424
|
-
total_coverage=pl.col("coverage").sum(),
|
|
425
|
-
covered_positions=(pl.col("coverage")>0).sum()
|
|
426
471
|
).join(
|
|
427
|
-
|
|
472
|
+
genome_gap_stats,
|
|
428
473
|
on="genome",
|
|
429
474
|
how="left"
|
|
430
475
|
).with_columns(
|
|
431
|
-
(pl.col("
|
|
432
|
-
(pl.col("
|
|
433
|
-
).select(
|
|
434
|
-
pl.col("genome"),
|
|
435
|
-
pl.col("cv"),
|
|
436
|
-
pl.col("breadth"),
|
|
437
|
-
pl.col("coverage"),
|
|
476
|
+
coverage=(pl.col("coverage")/pl.col("genome_length")),
|
|
477
|
+
breadth=(pl.col("total_covered_sites")/pl.col("genome_length")),
|
|
438
478
|
).with_columns(
|
|
439
|
-
|
|
479
|
+
ber=pl.col("breadth")/(1-(-0.883*pl.col("coverage")).exp()),
|
|
480
|
+
fug=pl.col("fug")
|
|
440
481
|
).with_columns(
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
482
|
+
|
|
483
|
+
pl.when(pl.col("coverage") > min_cov_use_fug)
|
|
484
|
+
.then(
|
|
485
|
+
pl.col("ber") > ber
|
|
445
486
|
).otherwise(
|
|
446
|
-
(pl.col("
|
|
447
|
-
|
|
487
|
+
(pl.col("fug")/0.632 < fug) &
|
|
488
|
+
(pl.col("ber") > ber)
|
|
489
|
+
).fill_null(False).alias("is_present"))
|
|
490
|
+
|
|
491
|
+
return profile.select(
|
|
492
|
+
pl.col("genome"),
|
|
493
|
+
pl.col("coverage"),
|
|
494
|
+
pl.col("breadth"),
|
|
495
|
+
pl.col("ber"),
|
|
496
|
+
pl.col("fug"),
|
|
497
|
+
pl.col("is_present")
|
|
448
498
|
)
|
|
449
499
|
|
|
450
|
-
|
|
451
|
-
|
|
500
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|