zipstrain 0.2.8__tar.gz → 0.2.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {zipstrain-0.2.8 → zipstrain-0.2.11}/PKG-INFO +1 -1
- {zipstrain-0.2.8 → zipstrain-0.2.11}/pyproject.toml +1 -1
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/cli.py +75 -15
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/profile.py +54 -6
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/task_manager.py +35 -28
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/utils.py +127 -78
- {zipstrain-0.2.8 → zipstrain-0.2.11}/README.md +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/__init__.py +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/compare.py +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/database.py +0 -0
- {zipstrain-0.2.8 → zipstrain-0.2.11}/src/zipstrain/visualize.py +0 -0
|
@@ -324,43 +324,91 @@ def to_complete_table(genome_comparison_object, output_file):
|
|
|
324
324
|
@click.option('--profile-file', '-p', required=True, help="Path to the profile Parquet file.")
|
|
325
325
|
@click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
|
|
326
326
|
@click.option('--bed-file', '-b', required=True, help="Path to the BED file.")
|
|
327
|
-
@click.option('--
|
|
328
|
-
@click.option('--
|
|
329
|
-
@click.option('--
|
|
327
|
+
@click.option('--read-loc-file', '-r', required=True, help="Path to the read location table.")
|
|
328
|
+
@click.option('--min-cov-fug', '-c', default=0.1, help="Minimum coverage to use fug.")
|
|
329
|
+
@click.option('--fug-threshold', '-f', default=2, help="FUG threshold.")
|
|
330
|
+
@click.option('--ber', '-e', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
|
|
330
331
|
@click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
|
|
331
|
-
def presence_profile(profile_file, stb_file, bed_file,
|
|
332
|
+
def presence_profile(profile_file, stb_file, bed_file, read_loc_file, min_cov_fug, fug_threshold, ber, output_file):
|
|
332
333
|
"""
|
|
333
|
-
Generate a presence profile
|
|
334
|
+
Generate a presence profile for genomes based on the given profile and read location data.
|
|
334
335
|
|
|
335
336
|
Args:
|
|
336
337
|
profile_file (str): Path to the profile Parquet file.
|
|
337
338
|
stb_file (str): Path to the scaffold-to-genome mapping file.
|
|
338
339
|
bed_file (str): Path to the BED file.
|
|
340
|
+
read_loc_file (str): Path to the read location table.
|
|
341
|
+
min_cov_fug (float): Minimum coverage to use fug.
|
|
342
|
+
fug_threshold (float): FUG threshold.
|
|
343
|
+
ber (float): Minimum ratio of breadth over expected breadth to consider presence.
|
|
344
|
+
output_file (str): Path to save the output Parquet file.
|
|
339
345
|
"""
|
|
340
|
-
profile=pl.scan_parquet(profile_file)
|
|
346
|
+
profile = pl.scan_parquet(profile_file)
|
|
341
347
|
stb = pl.scan_csv(stb_file, separator="\t", has_header=False).with_columns(
|
|
342
348
|
pl.col("column_1").alias("scaffold"),
|
|
343
349
|
pl.col("column_2").alias("genome")
|
|
344
350
|
).select(["scaffold", "genome"])
|
|
345
351
|
bed = pl.scan_csv(bed_file, separator="\t", has_header=False).with_columns(
|
|
346
352
|
pl.col("column_1").alias("scaffold"),
|
|
347
|
-
pl.col("column_2").alias("start"),
|
|
348
|
-
pl.col("column_3").alias("end")
|
|
353
|
+
pl.col("column_2").cast(pl.Int64).alias("start"),
|
|
354
|
+
pl.col("column_3").cast(pl.Int64).alias("end")
|
|
349
355
|
).select(["scaffold", "start", "end"])
|
|
350
|
-
|
|
356
|
+
read_loc_table = pl.scan_parquet(read_loc_file).rename({
|
|
357
|
+
"chrom":"scaffold",
|
|
358
|
+
"pos":"loc"
|
|
359
|
+
})
|
|
360
|
+
presence_df = ut.get_genome_stats(
|
|
351
361
|
profile=profile,
|
|
352
362
|
stb=stb,
|
|
353
363
|
bed=bed,
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
364
|
+
read_loc_table=read_loc_table,
|
|
365
|
+
min_cov_use_fug=min_cov_fug,
|
|
366
|
+
fug=fug_threshold,
|
|
367
|
+
ber=ber
|
|
368
|
+
)
|
|
369
|
+
presence_df.sink_parquet(output_file, compression='zstd')
|
|
370
|
+
|
|
371
|
+
@utilities.command("process-read-locs")
@click.option("--output-file", "-o", required=True, help="Path to save the processed read locations Parquet file.")
def process_read_locs(output_file):
    """
    Process read locations and save them to a Parquet file.

    The records are not passed as an argument: the underlying helper
    (``ut.process_read_location``) consumes them from standard input, so this
    command is meant to sit at the end of a shell pipe.

    Args:
        output_file (str): Path to save the output Parquet file.
    """
    destination = pathlib.Path(output_file)
    ut.process_read_location(output_file=destination)
|
|
358
381
|
|
|
359
382
|
@cli.group()
|
|
360
383
|
def gene_tools():
|
|
361
384
|
"""Holds anything related to gene analysis."""
|
|
362
385
|
pass
|
|
363
386
|
|
|
387
|
+
@utilities.command("generate_stb")
@click.option('--genomes-dir-file', '-g', required=True, help="Path to the genomes directory file. A text file with each line containing a genome fasta file path.")
@click.option('--output-file', '-o', required=True, help="Path to save the output scaffold-to-genome mapping file.")
@click.option('--extension', '-e', default=".fasta", help="File extension of the genome fasta files.")
def generate_stb(genomes_dir_file, output_file, extension):
    """
    Generate a scaffold-to-genome mapping file from the given genomes directory file.

    Two kinds of input are accepted:

    * a text file with one genome FASTA path per line (what the option help
      describes); blank lines are ignored and every listed path is used, and
    * a directory, in which case every file in it whose name ends with
      ``extension`` is used (the behaviour of the previous implementation).

    For each FASTA header line (``>``) one row ``<scaffold>\\t<genome>`` is
    written, where the scaffold is the first whitespace-delimited token of the
    header and the genome is the FASTA file name without its last suffix.

    Args:
        genomes_dir_file (str): Path to the genome list file, or to a directory of genome FASTA files.
        output_file (str): Path to save the output scaffold-to-genome mapping file.
        extension (str): File extension of the genome fasta files (only used to select files in directory mode).
    """
    source = pathlib.Path(genomes_dir_file)
    if source.is_dir():
        # Sorted so the output row order does not depend on filesystem listing order.
        genome_paths = sorted(source.glob(f"*{extension}"))
    else:
        # Globbing a regular file matches nothing, which used to produce an
        # empty mapping without any error; read the listed paths instead.
        with open(source, 'r') as list_f:
            genome_paths = [pathlib.Path(entry.strip()) for entry in list_f if entry.strip()]

    with open(output_file, 'w') as out_f:
        for genome in genome_paths:
            genome_name = genome.stem
            with open(genome, 'r') as gf:
                for line in gf:
                    if not line.startswith('>'):
                        continue
                    header_tokens = line[1:].split()
                    if not header_tokens:
                        # A bare ">" carries no scaffold name to map.
                        continue
                    out_f.write(f"{header_tokens[0]}\t{genome_name}\n")
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
|
|
364
412
|
|
|
365
413
|
@gene_tools.command("gene-range-table")
|
|
366
414
|
@click.option('--gene-file', '-g', required=True, help="location of gene file. Prodigal's nucleotide fasta output")
|
|
@@ -603,22 +651,34 @@ def prepare_profiling(reference_fasta, gene_fasta, stb_file, output_dir):
|
|
|
603
651
|
@profile.command("profile-single")
@click.option('--bed-file', '-b', required=True, help="Path to the BED file describing regions to be profiled.")
@click.option('--bam-file', '-a', required=True, help="Path to the BAM file to be profiled.")
@click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
@click.option('--gene-range-table', '-g', required=True, help="Path to the gene range table.")
@click.option('--num-workers', '-n', default=1, help="Number of workers to use for profiling.")
@click.option('--output-dir', '-o', required=True, help="Directory to save the profiling output.")
@click.option('--ber', '-r', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
@click.option('--fug', '-f', default=2.0, help="fraction of expected gaps (FUG) threshold.")
@click.option('--min-cov-use-fug', '-m', default=0.1, help="Minimum coverage to use FUG.")
def profile_single(bed_file, bam_file, stb_file, gene_range_table, num_workers, output_dir, ber, fug, min_cov_use_fug):
    """
    Profile a single BAM file using the provided BED file and gene range table.

    Args:
        bed_file (str): Path to the BED file describing regions to be profiled.
        bam_file (str): Path to the BAM file to be profiled.
        stb_file (str): Path to the headerless, tab-separated scaffold-to-genome mapping file.
        gene_range_table (str): Path to the gene range table.
        num_workers (int): Number of workers to use for profiling.
        output_dir (str): Directory to save the profiling output; created if missing.
        ber (float): Minimum ratio of breadth over expected breadth to consider presence.
        fug (float): FUG threshold.
        min_cov_use_fug (float): Minimum coverage to use FUG.
    """
    output_dir = pathlib.Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Keep only the two named columns, as the presence-profile command does,
    # so the raw "column_1"/"column_2" do not leak into downstream joins.
    stb = pl.scan_csv(stb_file, separator='\t', has_header=False).with_columns(
        pl.col("column_1").alias("scaffold"),
        pl.col("column_2").alias("genome")
    ).select(["scaffold", "genome"])
    pf.profile_bam(
        bed_file=bed_file,
        bam_file=bam_file,
        gene_range_table=gene_range_table,
        stb=stb,
        output_dir=output_dir,
        num_workers=num_workers,
        ber=ber,
        fug=fug,
        min_cov_use_fug=min_cov_use_fug
    )
|
|
623
683
|
|
|
624
684
|
@cli.group()
|
|
@@ -147,13 +147,27 @@ async def _profile_chunk_task(
|
|
|
147
147
|
stdout, stderr = await proc.communicate()
|
|
148
148
|
if proc.returncode != 0:
|
|
149
149
|
raise Exception(f"Command failed with error: {stderr.decode().strip()}")
|
|
150
|
+
cmd=["samtools", "view", "-F", "132", "-L", str(bed_file.absolute()), str(bam_file.absolute()), "|", "zipstrain", "utilities", "process-read-locs", "--output-file", f"{bam_file.stem}_read_locs_{chunk_id}.parquet"]
|
|
151
|
+
proc = await asyncio.create_subprocess_shell(
|
|
152
|
+
" ".join(cmd),
|
|
153
|
+
stdout=asyncio.subprocess.PIPE,
|
|
154
|
+
stderr=asyncio.subprocess.PIPE,
|
|
155
|
+
cwd=output_dir
|
|
156
|
+
)
|
|
157
|
+
stdout, stderr = await proc.communicate()
|
|
158
|
+
if proc.returncode != 0:
|
|
159
|
+
raise Exception(f"Command failed with error: {stderr.decode().strip()}")
|
|
150
160
|
|
|
151
161
|
async def profile_bam_in_chunks(
|
|
152
162
|
bed_file:str,
|
|
153
163
|
bam_file:str,
|
|
154
164
|
gene_range_table:str,
|
|
165
|
+
stb:pl.LazyFrame,
|
|
155
166
|
output_dir:str,
|
|
156
|
-
num_workers:int=4
|
|
167
|
+
num_workers:int=4,
|
|
168
|
+
ber:float=0.5,
|
|
169
|
+
fug:float=2.0,
|
|
170
|
+
min_cov_use_fug:int=0.1
|
|
157
171
|
)->None:
|
|
158
172
|
"""
|
|
159
173
|
Profile a BAM file in chunks using provided BED files.
|
|
@@ -189,17 +203,46 @@ async def profile_bam_in_chunks(
|
|
|
189
203
|
chunk_id=chunk_id
|
|
190
204
|
))
|
|
191
205
|
await asyncio.gather(*tasks)
|
|
192
|
-
pfs=[output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet" for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
|
|
193
|
-
|
|
194
|
-
|
|
206
|
+
pfs=[(output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet", output_dir/"tmp"/f"{bam_file.stem}_read_locs_{chunk_id}.parquet" ) for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
|
|
207
|
+
|
|
208
|
+
mpile_container: list[pl.LazyFrame] = []
|
|
209
|
+
read_loc_pfs: list[pl.LazyFrame] = []
|
|
210
|
+
for pf, read_loc_pf in pfs:
|
|
211
|
+
mpile_container.append(pl.scan_parquet(pf).lazy())
|
|
212
|
+
read_loc_pfs.append(pl.scan_parquet(read_loc_pf).lazy())
|
|
213
|
+
|
|
214
|
+
mpileup_df = pl.concat(mpile_container)
|
|
215
|
+
mpileup_df.sink_parquet(output_dir/f"{bam_file.stem}_profile.parquet", compression='zstd', engine='streaming')
|
|
216
|
+
read_loc_df = pl.concat(read_loc_pfs).rename(
|
|
217
|
+
{
|
|
218
|
+
"chrom":"scaffold",
|
|
219
|
+
"pos":"loc",
|
|
220
|
+
}
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
utils.get_genome_stats(
|
|
224
|
+
profile=mpileup_df,
|
|
225
|
+
read_loc_table=read_loc_df,
|
|
226
|
+
stb=stb,
|
|
227
|
+
bed=bed_lf.rename({"column_1":"scaffold","column_2":"start","column_3":"end"}),
|
|
228
|
+
ber=ber,
|
|
229
|
+
fug=fug,
|
|
230
|
+
min_cov_use_fug=min_cov_use_fug,
|
|
231
|
+
).sink_parquet(output_dir/f"{bam_file.stem}_genome_stats.parquet", compression='zstd', engine='streaming')
|
|
232
|
+
|
|
233
|
+
|
|
195
234
|
os.system(f"rm -r {output_dir}/tmp")
|
|
196
235
|
|
|
197
236
|
def profile_bam(
|
|
198
237
|
bed_file:str,
|
|
199
238
|
bam_file:str,
|
|
200
239
|
gene_range_table:str,
|
|
240
|
+
stb:pl.LazyFrame,
|
|
201
241
|
output_dir:str,
|
|
202
|
-
num_workers:int=4
|
|
242
|
+
num_workers:int=4,
|
|
243
|
+
ber:float=0.5,
|
|
244
|
+
fug:float=2.0,
|
|
245
|
+
min_cov_use_fug:int=0.1
|
|
203
246
|
)->None:
|
|
204
247
|
"""
|
|
205
248
|
Profile a BAM file in chunks using provided BED files.
|
|
@@ -208,6 +251,7 @@ def profile_bam(
|
|
|
208
251
|
bed_file (list[pathlib.Path]): A bed file describing all regions to be profiled.
|
|
209
252
|
bam_file (pathlib.Path): Path to the BAM file.
|
|
210
253
|
gene_range_table (pathlib.Path): Path to the gene range table.
|
|
254
|
+
stb (pl.LazyFrame): Scaffold-to-bin mapping table.
|
|
211
255
|
output_dir (pathlib.Path): Directory to save output files.
|
|
212
256
|
num_workers (int): Number of concurrent workers to use.
|
|
213
257
|
"""
|
|
@@ -215,7 +259,11 @@ def profile_bam(
|
|
|
215
259
|
bed_file=bed_file,
|
|
216
260
|
bam_file=bam_file,
|
|
217
261
|
gene_range_table=gene_range_table,
|
|
262
|
+
stb=stb,
|
|
218
263
|
output_dir=output_dir,
|
|
219
|
-
num_workers=num_workers
|
|
264
|
+
num_workers=num_workers,
|
|
265
|
+
ber=ber,
|
|
266
|
+
fug=fug,
|
|
267
|
+
min_cov_use_fug=min_cov_use_fug
|
|
220
268
|
))
|
|
221
269
|
|
|
@@ -127,6 +127,9 @@ class Status(StrEnum):
|
|
|
127
127
|
SUCCESS = "success"
|
|
128
128
|
PENDING = "pending"
|
|
129
129
|
|
|
130
|
+
class Messages(StrEnum):
|
|
131
|
+
"""Enumeration of common messages used in task and batch management."""
|
|
132
|
+
CANCELLED_BY_USER = "Task was cancelled by a signal from the user."
|
|
130
133
|
|
|
131
134
|
class Input(ABC):
|
|
132
135
|
"""Abstract base class for task inputs. DO NOT INSTANTIATE DIRECTLY.
|
|
@@ -275,7 +278,6 @@ class IntOutput(Output):
|
|
|
275
278
|
raise ValueError(f"Output value for task {self.task.id} is not an integer.")
|
|
276
279
|
else:
|
|
277
280
|
return False
|
|
278
|
-
return False
|
|
279
281
|
|
|
280
282
|
|
|
281
283
|
class Engine(ABC):
|
|
@@ -391,6 +393,7 @@ class Task(ABC):
|
|
|
391
393
|
"""Asynchronously reads the task status from the .status file in the task directory."""
|
|
392
394
|
status_path = self.task_dir / ".status"
|
|
393
395
|
# read the status file if it exists
|
|
396
|
+
|
|
394
397
|
if status_path.exists():
|
|
395
398
|
raw = await read_file(status_path, self.file_semaphore)
|
|
396
399
|
self._status = raw.strip()
|
|
@@ -406,12 +409,11 @@ class Task(ABC):
|
|
|
406
409
|
except Exception:
|
|
407
410
|
all_ready = False
|
|
408
411
|
|
|
409
|
-
if all_ready:
|
|
412
|
+
if all_ready or self._batch_obj._cleaned_up:
|
|
410
413
|
self._status = Status.SUCCESS.value
|
|
411
|
-
|
|
414
|
+
|
|
412
415
|
else:
|
|
413
416
|
self._status = Status.FAILED.value
|
|
414
|
-
await write_file(status_path, Status.FAILED.value, self.file_semaphore)
|
|
415
417
|
raise ValueError(f"Task {self.id} reported done but outputs are not ready or invalid. {self.expected_outputs['output-file'].expected_file.absolute()}")
|
|
416
418
|
|
|
417
419
|
return self._status
|
|
@@ -526,8 +528,8 @@ class ProfileTaskGenerator(TaskGenerator):
|
|
|
526
528
|
}
|
|
527
529
|
expected_outputs ={
|
|
528
530
|
"profile": FileOutput(row["sample_name"]+".parquet" ),
|
|
529
|
-
"breadth": FileOutput(row["sample_name"]+"_breadth.parquet" ),
|
|
530
531
|
"scaffold": FileOutput(row["sample_name"]+".parquet.scaffolds" ),
|
|
532
|
+
"genome-stats": FileOutput(row["sample_name"]+"_genome_stats.parquet" ),
|
|
531
533
|
}
|
|
532
534
|
task = ProfileBamTask(id=row["sample_name"], inputs=inputs, expected_outputs=expected_outputs, engine=self.engine)
|
|
533
535
|
tasks.append(task)
|
|
@@ -637,9 +639,8 @@ class Batch(ABC):
|
|
|
637
639
|
task.map_io()
|
|
638
640
|
|
|
639
641
|
self._runner_obj:Runner = None
|
|
640
|
-
|
|
642
|
+
self._cleaned_up = False
|
|
641
643
|
|
|
642
|
-
|
|
643
644
|
def _get_initial_status(self) -> str:
|
|
644
645
|
"""Returns the initial status of the batch based on the presence of the batch directory."""
|
|
645
646
|
if not self.batch_dir.exists():
|
|
@@ -663,6 +664,7 @@ class Batch(ABC):
|
|
|
663
664
|
|
|
664
665
|
def cleanup(self) -> None:
|
|
665
666
|
"""The base class defines if any cleanup is needed after batch success. By default, it does nothing."""
|
|
667
|
+
self._cleaned_up = True
|
|
666
668
|
return None
|
|
667
669
|
|
|
668
670
|
@abstractmethod
|
|
@@ -724,27 +726,24 @@ class LocalBatch(Batch):
|
|
|
724
726
|
|
|
725
727
|
async def run(self) -> None:
|
|
726
728
|
"""This method runs all tasks in the batch locally by creating a shell script and executing it."""
|
|
727
|
-
if self.status != Status.SUCCESS
|
|
729
|
+
if self.status != Status.SUCCESS:
|
|
728
730
|
self.batch_dir.mkdir(parents=True, exist_ok=True)
|
|
731
|
+
|
|
729
732
|
self._status = Status.RUNNING.value
|
|
730
|
-
|
|
731
|
-
|
|
732
733
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
733
|
-
|
|
734
|
+
|
|
735
|
+
script_path = self.batch_dir / f"{self.id}.sh" # Path to the shell script for the batch
|
|
736
|
+
script = self._script # Initialize the script content
|
|
737
|
+
|
|
734
738
|
for task in self.tasks:
|
|
735
|
-
if task.status
|
|
739
|
+
if task.status != Status.SUCCESS.value:
|
|
736
740
|
task.task_dir.mkdir(parents=True, exist_ok=True) # Create task directory
|
|
737
741
|
await write_file(task.task_dir / ".status", Status.NOT_STARTED.value, self.file_semaphore)
|
|
738
|
-
|
|
739
|
-
script_path = self.batch_dir / f"{self.id}.sh" # Path to the shell script for the batch
|
|
740
|
-
|
|
741
|
-
script = self._script
|
|
742
|
-
for task in self.tasks:
|
|
743
|
-
if task.status == Status.NOT_STARTED.value or task.status == Status.FAILED.value:
|
|
744
742
|
script += f"\n{task.pre_run}\n{task.command}\n{task.post_run}\n"
|
|
745
743
|
|
|
744
|
+
|
|
745
|
+
|
|
746
746
|
await write_file(script_path, script, self.file_semaphore)
|
|
747
|
-
|
|
748
747
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
749
748
|
|
|
750
749
|
self._proc = await asyncio.create_subprocess_exec(
|
|
@@ -753,12 +752,17 @@ class LocalBatch(Batch):
|
|
|
753
752
|
stderr=asyncio.subprocess.PIPE,
|
|
754
753
|
cwd=self.batch_dir,
|
|
755
754
|
)
|
|
755
|
+
|
|
756
756
|
try:
|
|
757
757
|
out_bytes, err_bytes = await self._proc.communicate()
|
|
758
|
+
|
|
758
759
|
except asyncio.CancelledError:
|
|
759
760
|
if self._proc and self._proc.returncode is None:
|
|
760
761
|
self._proc.terminate()
|
|
762
|
+
await write_file(self.batch_dir / f"{self.id}.err", Messages.CANCELLED_BY_USER, self.file_semaphore)
|
|
763
|
+
|
|
761
764
|
raise
|
|
765
|
+
|
|
762
766
|
|
|
763
767
|
await write_file(self.batch_dir / f"{self.id}.out", out_bytes.decode(), self.file_semaphore)
|
|
764
768
|
await write_file(self.batch_dir / f"{self.id}.err", err_bytes.decode(), self.file_semaphore)
|
|
@@ -767,14 +771,14 @@ class LocalBatch(Batch):
|
|
|
767
771
|
self.cleanup()
|
|
768
772
|
self._status = Status.SUCCESS.value
|
|
769
773
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
774
|
+
|
|
770
775
|
else:
|
|
771
776
|
self._status = Status.FAILED.value
|
|
772
777
|
await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
|
|
773
|
-
|
|
774
|
-
elif self.status == Status.SUCCESS.value and self.outputs_ready():
|
|
775
|
-
self._status = Status.SUCCESS.value
|
|
778
|
+
|
|
776
779
|
else:
|
|
777
|
-
self._status = Status.
|
|
780
|
+
self._status = Status.SUCCESS.value
|
|
781
|
+
|
|
778
782
|
|
|
779
783
|
def _parse_job_id(self, sbatch_output):
|
|
780
784
|
return super()._parse_job_id(sbatch_output)
|
|
@@ -1470,15 +1474,11 @@ class ProfileBamTask(Task):
|
|
|
1470
1474
|
zipstrain profile profile-single --bam-file input.bam \
|
|
1471
1475
|
--bed-file bed_file.bed \
|
|
1472
1476
|
--gene-range-table gene-range-table.bed \
|
|
1477
|
+
--stb-file <stb-file> \
|
|
1473
1478
|
--num-workers <num-workers> \
|
|
1474
1479
|
--output-dir .
|
|
1475
1480
|
mv input.bam.parquet <sample-name>.parquet
|
|
1476
1481
|
samtools idxstats <bam-file> | awk '$3 > 0 {print $1}' > <sample-name>.parquet.scaffolds
|
|
1477
|
-
zipstrain utilities genome_breadth_matrix --profile <sample-name>.parquet \
|
|
1478
|
-
--genome-length <genome-length-file> \
|
|
1479
|
-
--stb <stb-file> \
|
|
1480
|
-
--min-cov <breadth-min-cov> \
|
|
1481
|
-
--output-file <sample-name>_breadth.parquet
|
|
1482
1482
|
"""
|
|
1483
1483
|
|
|
1484
1484
|
class FastCompareTask(Task):
|
|
@@ -1552,6 +1552,7 @@ class FastCompareLocalBatch(LocalBatch):
|
|
|
1552
1552
|
for task in tasks_to_remove:
|
|
1553
1553
|
self.tasks.remove(task)
|
|
1554
1554
|
shutil.rmtree(task.task_dir)
|
|
1555
|
+
self._cleaned_up = True
|
|
1555
1556
|
|
|
1556
1557
|
class FastCompareSlurmBatch(SlurmBatch):
|
|
1557
1558
|
"""A SlurmBatch that runs FastCompareTask tasks on a Slurm cluster. Maybe removed in future"""
|
|
@@ -1560,6 +1561,8 @@ class FastCompareSlurmBatch(SlurmBatch):
|
|
|
1560
1561
|
for task in tasks_to_remove:
|
|
1561
1562
|
self.tasks.remove(task)
|
|
1562
1563
|
shutil.rmtree(task.task_dir)
|
|
1564
|
+
|
|
1565
|
+
self._cleaned_up = True
|
|
1563
1566
|
|
|
1564
1567
|
class PrepareCompareGenomeRunOutputsLocalBatch(LocalBatch):
|
|
1565
1568
|
pass
|
|
@@ -1925,8 +1928,11 @@ class FastGeneCompareLocalBatch(LocalBatch):
|
|
|
1925
1928
|
def cleanup(self) -> None:
|
|
1926
1929
|
tasks_to_remove = [task for task in self.tasks if isinstance(task, FastGeneCompareTask)]
|
|
1927
1930
|
for task in tasks_to_remove:
|
|
1931
|
+
task._status=Status.SUCCESS
|
|
1928
1932
|
self.tasks.remove(task)
|
|
1929
1933
|
shutil.rmtree(task.task_dir)
|
|
1934
|
+
self._cleaned_up = True
|
|
1935
|
+
|
|
1930
1936
|
|
|
1931
1937
|
class FastGeneCompareSlurmBatch(SlurmBatch):
|
|
1932
1938
|
"""A SlurmBatch that runs FastGeneCompareTask tasks on a Slurm cluster."""
|
|
@@ -1935,6 +1941,7 @@ class FastGeneCompareSlurmBatch(SlurmBatch):
|
|
|
1935
1941
|
for task in tasks_to_remove:
|
|
1936
1942
|
self.tasks.remove(task)
|
|
1937
1943
|
shutil.rmtree(task.task_dir)
|
|
1944
|
+
self._cleaned_up = True
|
|
1938
1945
|
|
|
1939
1946
|
class PrepareGeneCompareRunOutputsLocalBatch(LocalBatch):
|
|
1940
1947
|
pass
|
|
@@ -140,7 +140,7 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output
|
|
|
140
140
|
|
|
141
141
|
if writer is None:
|
|
142
142
|
# Open writer for the first time
|
|
143
|
-
writer = pq.ParquetWriter(output_file, schema, compression='
|
|
143
|
+
writer = pq.ParquetWriter(output_file, schema, compression='zstd')
|
|
144
144
|
writer.write_table(pa.Table.from_batches([batch]))
|
|
145
145
|
|
|
146
146
|
# Clear buffers
|
|
@@ -180,6 +180,51 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output
|
|
|
180
180
|
if writer:
|
|
181
181
|
writer.close()
|
|
182
182
|
|
|
183
|
+
def process_read_location(output_file:str, batch_size:int=10000)->None:
    """
    Read alignment records from standard input and store each record's
    reference name and position in a Parquet file.

    The input is expected to be the text output of ``samtools view -F 132``:
    tab-separated records with the reference name in the third field and the
    position in the fourth. Blank lines, lines with fewer than four fields and
    SAM header lines (starting with ``@``) are skipped.

    The output file is always created, with columns ``chrom`` (string) and
    ``pos`` (int32), even when no record is read, so that consumers can scan
    it unconditionally.

    Args:
        output_file (str): Path of the Parquet file to write (zstd-compressed).
        batch_size (int): Number of records buffered in memory before they are written out.

    Raises:
        ValueError: If the position field of a record is not an integer.
    """
    schema = pa.schema([
        ('chrom', pa.string()),
        ('pos', pa.int32()),
    ])
    chroms: list[str] = []
    positions: list[int] = []

    def flush_batch(writer) -> None:
        """Write the buffered records as one table and empty the buffers."""
        if not chroms:
            return
        batch = pa.RecordBatch.from_arrays([
            pa.array(chroms, type=pa.string()),
            pa.array(positions, type=pa.int32()),
        ], schema=schema)
        writer.write_table(pa.Table.from_batches([batch]))
        chroms.clear()
        positions.clear()

    # The context manager closes the writer even if parsing raises, and opening
    # it up front guarantees a valid (possibly empty) Parquet file on disk.
    with pq.ParquetWriter(output_file, schema, compression='zstd') as writer:
        for line in sys.stdin:
            record = line.strip()
            # Read names cannot start with "@" in SAM, so this only drops header lines.
            if not record or record.startswith('@'):
                continue
            fields = record.split('\t')
            if len(fields) < 4:
                continue
            chroms.append(fields[2])
            positions.append(int(fields[3]))
            if len(chroms) >= batch_size:
                flush_batch(writer)
        # Flush remaining data
        flush_batch(writer)
|
|
226
|
+
|
|
227
|
+
|
|
183
228
|
def extract_genome_length(stb: pl.LazyFrame, bed_table: pl.LazyFrame) -> pl.LazyFrame:
|
|
184
229
|
"""
|
|
185
230
|
Extract the genome length information from the scaffold-to-genome mapping table.
|
|
@@ -350,102 +395,106 @@ def split_lf_to_chunks(lf:pl.LazyFrame,num_chunks:int)->list[pl.LazyFrame]:
|
|
|
350
395
|
return chunks
|
|
351
396
|
|
|
352
397
|
|
|
353
|
-
def
|
|
398
|
+
def get_genome_gaps(
    read_loc_table: pl.LazyFrame,
    stb: pl.LazyFrame,
    genome_length: pl.LazyFrame,
)-> pl.LazyFrame:
    """
    Compute a per-genome ``fug`` score from the spacing between read positions.

    Steps, as implemented below:

    1. Sort reads by scaffold and position; ``gap_length`` is the distance to
       the previous read on the same scaffold.
    2. Per genome, ``rn`` is the number of reads and ``delta`` is
       ``round(genome_length / rn)``.
    3. Keep only gaps longer than ``delta``. For every distinct gap length the
       share of such gaps (count / (rn - 1)) is weighted by the excess
       ``gap_length - delta``.
    4. ``fug = (delta - sum of weighted excesses) / delta``.

    Args:
        read_loc_table (pl.LazyFrame): Read positions with columns ``scaffold`` and ``loc``.
        stb (pl.LazyFrame): Scaffold-to-genome mapping with columns ``scaffold`` and ``genome``.
        genome_length (pl.LazyFrame): Table with columns ``genome`` and ``genome_length``.

    Returns:
        pl.LazyFrame: Columns ``genome`` and ``fug``.
    """
    # NOTE(review): a genome with no gap longer than its delta is removed by the
    # filter in step 3 and therefore has no row in the result — confirm this is
    # the intended contract for callers that left-join on "genome".
    previous_loc = pl.col("loc").shift(1).over("scaffold")
    reads = (
        read_loc_table
        .sort(["scaffold", "loc"])
        .with_columns((pl.col("loc") - previous_loc).alias("gap_length"))
        .join(stb, on="scaffold", how="left")
    )

    per_genome = (
        reads
        .group_by("genome")
        .agg(rn=pl.len())
        .join(genome_length, on="genome", how="left")
        .with_columns((pl.col("genome_length") / pl.col("rn")).round().alias("delta"))
        .select(pl.col("genome"), pl.col("delta"), pl.col("rn"))
    )

    long_gaps = (
        reads
        .join(per_genome, on="genome", how="left")
        .filter(pl.col("gap_length") > pl.col("delta"))
    )

    weighted_excess = (
        long_gaps
        .group_by(["genome", "gap_length"])
        .agg(
            pd=(pl.len() / (pl.col("rn").first() - 1)),
            delta=pl.col("delta").first(),
        )
        .with_columns(pd=pl.col("pd") * (pl.col("gap_length") - pl.col("delta")))
    )

    genome_delta = pl.col("delta").first()
    genome_fug = weighted_excess.group_by("genome").agg(
        fug=(genome_delta - pl.col("pd").sum()) / genome_delta
    )
    return genome_fug.select(pl.col("genome"), pl.col("fug"))
|
|
441
|
+
|
|
442
|
+
def get_genome_stats(
    profile:pl.LazyFrame,
    bed: pl.LazyFrame,
    stb: pl.LazyFrame,
    read_loc_table: pl.LazyFrame,
    ber:float=0.5,
    fug:float=2,
    min_cov_use_fug:float=0.1
)->pl.LazyFrame:
    """
    Summarise coverage per genome and decide whether each genome is present.

    For every genome the following columns are produced:

    * ``coverage``: sum of A+C+G+T counts over the profile rows, divided by the genome length.
    * ``breadth``: number of profile rows divided by the genome length.
    * ``ber``: ``breadth / (1 - exp(-0.883 * coverage))``, i.e. breadth over expected breadth.
    * ``fug``: the gap score returned by ``get_genome_gaps``.
    * ``is_present``: when ``coverage > min_cov_use_fug`` this is ``ber > ber``-threshold;
      otherwise it additionally requires ``fug / 0.632 < fug``-threshold.
      Null results (e.g. a missing ``fug``) count as not present.

    Args:
        profile (pl.LazyFrame): Profile table with columns ``chrom``, ``A``, ``C``, ``G`` and ``T``.
        bed (pl.LazyFrame): BED table, passed to ``extract_genome_length``.
        stb (pl.LazyFrame): Scaffold-to-genome mapping with columns ``scaffold`` and ``genome``.
        read_loc_table (pl.LazyFrame): Read positions, passed to ``get_genome_gaps``.
        ber (float): Minimum breadth over expected breadth ratio to call a genome present.
        fug (float): Upper bound applied to ``fug / 0.632`` in the low-coverage branch.
        min_cov_use_fug (float): Coverage above which only the ``ber`` criterion is applied.

    Returns:
        pl.LazyFrame: Columns ``genome``, ``coverage``, ``breadth``, ``ber``, ``fug`` and ``is_present``.
    """
    genome_lengths = extract_genome_length(stb, bed)
    genome_gap_stats = get_genome_gaps(read_loc_table, stb, genome_lengths)

    # Attach the genome to every profiled position and reduce it to its depth.
    per_site = profile.join(
        stb,
        left_on="chrom",
        right_on="scaffold",
        how="left"
    ).select(
        pl.col("genome"),
        (pl.col("A")+pl.col("C")+pl.col("G")+pl.col("T")).alias("coverage")
    )

    stats = per_site.group_by("genome").agg(
        total_covered_sites=pl.len(),
        coverage=pl.col("coverage").sum()
    ).join(
        genome_lengths,
        on="genome",
        how="left"
    ).join(
        genome_gap_stats,
        on="genome",
        how="left"
    ).with_columns(
        coverage=(pl.col("coverage")/pl.col("genome_length")),
        breadth=(pl.col("total_covered_sites")/pl.col("genome_length")),
    ).with_columns(
        ber=pl.col("breadth")/(1-(-0.883*pl.col("coverage")).exp()),
    ).with_columns(
        # NOTE(review): 0.632 is presumably 1 - 1/e — confirm against the method description.
        pl.when(pl.col("coverage") > min_cov_use_fug)
        .then(
            pl.col("ber") > ber
        ).otherwise(
            (pl.col("fug")/0.632 < fug) &
            (pl.col("ber") > ber)
        ).fill_null(False).alias("is_present"))

    return stats.select(
        pl.col("genome"),
        pl.col("coverage"),
        pl.col("breadth"),
        pl.col("ber"),
        pl.col("fug"),
        pl.col("is_present")
    )
|
|
449
499
|
|
|
450
|
-
|
|
451
|
-
|
|
500
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|