zipstrain 0.2.8__tar.gz → 0.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: zipstrain
- Version: 0.2.8
+ Version: 0.2.16
  Summary:
  Author: ParsaGhadermazi
  Author-email: 54489047+ParsaGhadermazi@users.noreply.github.com
@@ -1,6 +1,6 @@
  [project]
  name = "zipstrain"
- version = "0.2.8"
+ version = "0.2.16"
  description = ""
  authors = [
      {name = "ParsaGhadermazi",email = "54489047+ParsaGhadermazi@users.noreply.github.com"}
@@ -324,43 +324,91 @@ def to_complete_table(genome_comparison_object, output_file):
  @click.option('--profile-file', '-p', required=True, help="Path to the profile Parquet file.")
  @click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
  @click.option('--bed-file', '-b', required=True, help="Path to the BED file.")
- @click.option('--min-cov', '-c', default=0.1, help="Minimum genome-wide coverage to use homogeneous Poisson point process.")
- @click.option('--ber', '-e', default=0.5, help="Minimum breadth to expected breadth ratio to consider a genome as present.")
- @click.option('--cv-threshold', '-v', default=1.0, help="Maximum coefficient of variation to consider genome as present when coverage is smaller than min-cov.")
+ @click.option('--read-loc-file', '-r', required=True, help="Path to the read location table.")
+ @click.option('--min-cov-fug', '-c', default=0.1, help="Coverage threshold at or below which the FUG statistic is used.")
+ @click.option('--fug-threshold', '-f', default=2.0, help="Maximum normalized FUG value to consider a genome as present.")
+ @click.option('--ber', '-e', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
  @click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
- def presence_profile(profile_file, stb_file, bed_file, min_cov, ber, cv_threshold, output_file):
+ def presence_profile(profile_file, stb_file, bed_file, read_loc_file, min_cov_fug, fug_threshold, ber, output_file):
      """
-     Generate a presence profile from the given files.
+     Generate a presence profile for genomes based on the given profile and read location data.

      Args:
          profile_file (str): Path to the profile Parquet file.
          stb_file (str): Path to the scaffold-to-genome mapping file.
          bed_file (str): Path to the BED file.
+         read_loc_file (str): Path to the read location table.
+         min_cov_fug (float): Coverage threshold at or below which the FUG statistic is used.
+         fug_threshold (float): Maximum normalized FUG value to consider a genome as present.
+         ber (float): Minimum ratio of breadth over expected breadth to consider presence.
+         output_file (str): Path to save the output Parquet file.
      """
-     profile=pl.scan_parquet(profile_file)
+     profile = pl.scan_parquet(profile_file)
      stb = pl.scan_csv(stb_file, separator="\t", has_header=False).with_columns(
          pl.col("column_1").alias("scaffold"),
          pl.col("column_2").alias("genome")
      ).select(["scaffold", "genome"])
      bed = pl.scan_csv(bed_file, separator="\t", has_header=False).with_columns(
          pl.col("column_1").alias("scaffold"),
-         pl.col("column_2").alias("start"),
-         pl.col("column_3").alias("end")
+         pl.col("column_2").cast(pl.Int64).alias("start"),
+         pl.col("column_3").cast(pl.Int64).alias("end")
      ).select(["scaffold", "start", "end"])
-     ut.estimate_genome_presence(
+     read_loc_table = pl.scan_parquet(read_loc_file).rename({
+         "chrom": "scaffold",
+         "pos": "loc"
+     })
+     presence_df = ut.get_genome_stats(
          profile=profile,
          stb=stb,
          bed=bed,
-         cv_threshold=cv_threshold,
-         ber=ber,
-         min_cov_constant_poisson=min_cov
-     ).sink_parquet(output_file, compression='zstd',engine="streaming")
+         read_loc_table=read_loc_table,
+         min_cov_use_fug=min_cov_fug,
+         fug=fug_threshold,
+         ber=ber
+     )
+     presence_df.sink_parquet(output_file, compression='zstd')
+ 
+ @utilities.command("process-read-locs")
+ @click.option("--output-file", "-o", required=True, help="Path to save the processed read locations Parquet file.")
+ def process_read_locs(output_file):
+     """
+     Process read locations streamed on stdin (samtools view output) and save them to a Parquet file.
+ 
+     Args:
+         output_file (str): Path to save the output Parquet file.
+     """
+     ut.process_read_location(output_file=pathlib.Path(output_file))
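Note on usage: process-read-locs reads alignment records from stdin, so it is meant to sit downstream of samtools view -F 132 (flag 4 = unmapped, 128 = second in pair), exactly as the chunked profiler later in this diff invokes it. A minimal sketch of that wiring in Python; the file names here are hypothetical:

    import subprocess

    # Hypothetical inputs: a coordinate-sorted, indexed BAM plus a BED of target regions.
    bam_file = "sample1.bam"
    bed_file = "regions.bed"

    # -F 132 drops unmapped reads and second-in-pair mates, matching _profile_chunk_task.
    samtools = subprocess.Popen(
        ["samtools", "view", "-F", "132", "-L", bed_file, bam_file],
        stdout=subprocess.PIPE,
    )
    zipstrain = subprocess.run(
        ["zipstrain", "utilities", "process-read-locs",
         "--output-file", "sample1_read_locs.parquet"],
        stdin=samtools.stdout,
    )
    samtools.stdout.close()
    if samtools.wait() != 0 or zipstrain.returncode != 0:
        raise RuntimeError("read-location extraction failed")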

  @cli.group()
  def gene_tools():
      """Holds anything related to gene analysis."""
      pass

+ @utilities.command("generate_stb")
+ @click.option('--genomes-dir-file', '-g', required=True, help="Path to the directory containing the genome fasta files.")
+ @click.option('--output-file', '-o', required=True, help="Path to save the output scaffold-to-genome mapping file.")
+ @click.option('--extension', '-e', default=".fasta", help="File extension of the genome fasta files.")
+ def generate_stb(genomes_dir_file, output_file, extension):
+     """
+     Generate a scaffold-to-genome mapping file from the genome fasta files in the given directory.
+ 
+     Args:
+         genomes_dir_file (str): Path to the directory containing the genome fasta files.
+         output_file (str): Path to save the output scaffold-to-genome mapping file.
+         extension (str): File extension of the genome fasta files.
+     """
+     with open(output_file, 'w') as out_f:
+         for genome in pathlib.Path(genomes_dir_file).glob(f"*{extension}"):
+             genome_name = genome.stem
+             with open(genome, 'r') as gf:
+                 for line in gf:
+                     if line.startswith('>'):
+                         scaffold_name = line[1:].strip().split()[0]
+                         out_f.write(f"{scaffold_name}\t{genome_name}\n")
+ 
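For reference, the file this command writes is the same two-column, headerless TSV that presence-profile and profile-single load with pl.scan_csv(..., separator='\t', has_header=False). A small sketch with made-up scaffold and genome names:

    import polars as pl

    # Hypothetical stb content: scaffold<TAB>genome, no header row.
    stb_text = "contig_1\tgenomeA\ncontig_2\tgenomeA\ncontig_9\tgenomeB\n"
    with open("example.stb", "w") as fh:
        fh.write(stb_text)

    # Load it exactly the way the CLI commands in this diff do.
    stb = pl.scan_csv("example.stb", separator="\t", has_header=False).with_columns(
        pl.col("column_1").alias("scaffold"),
        pl.col("column_2").alias("genome"),
    ).select(["scaffold", "genome"])
    print(stb.collect())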

  @gene_tools.command("gene-range-table")
  @click.option('--gene-file', '-g', required=True, help="location of gene file. Prodigal's nucleotide fasta output")
@@ -603,22 +651,34 @@ def prepare_profiling(reference_fasta, gene_fasta, stb_file, output_dir):
  @profile.command("profile-single")
  @click.option('--bed-file', '-b', required=True, help="Path to the BED file describing regions to be profiled.")
  @click.option('--bam-file', '-a', required=True, help="Path to the BAM file to be profiled.")
+ @click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
  @click.option('--gene-range-table', '-g', required=True, help="Path to the gene range table.")
  @click.option('--num-workers', '-n', default=1, help="Number of workers to use for profiling.")
  @click.option('--output-dir', '-o', required=True, help="Directory to save the profiling output.")
- def profile_single(bed_file, bam_file, gene_range_table, num_workers, output_dir):
+ @click.option('--ber', '-r', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
+ @click.option('--fug', '-f', default=2.0, help="Fraction of expected gaps (FUG) threshold.")
+ @click.option('--min-cov-use-fug', '-m', default=0.1, help="Coverage threshold at or below which FUG is used.")
+ def profile_single(bed_file, bam_file, stb_file, gene_range_table, num_workers, output_dir, ber, fug, min_cov_use_fug):
      """
      Profile a single BAM file using the provided BED file and gene range table.

      """
      output_dir=pathlib.Path(output_dir)
      output_dir.mkdir(parents=True, exist_ok=True)
+     stb = pl.scan_csv(stb_file, separator='\t', has_header=False).with_columns(
+         pl.col("column_1").alias("scaffold"),
+         pl.col("column_2").alias("genome")
+     )
      pf.profile_bam(
          bed_file=bed_file,
          bam_file=bam_file,
          gene_range_table=gene_range_table,
+         stb=stb,
          output_dir=output_dir,
-         num_workers=num_workers
+         num_workers=num_workers,
+         ber=ber,
+         fug=fug,
+         min_cov_use_fug=min_cov_use_fug
      )

  @cli.group()
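With the new signature, profile_bam takes the scaffold-to-genome table as a Polars LazyFrame alongside the presence thresholds. A sketch of driving it from Python instead of the CLI; the import path for the profiling module is an assumption based on the pf alias used above, and all file names are hypothetical:

    import pathlib
    import polars as pl
    from zipstrain import profile as pf  # assumed import path for the profiling module

    stb = pl.scan_csv("genomes.stb", separator="\t", has_header=False).with_columns(
        pl.col("column_1").alias("scaffold"),
        pl.col("column_2").alias("genome"),
    )
    out_dir = pathlib.Path("out")
    out_dir.mkdir(parents=True, exist_ok=True)
    pf.profile_bam(
        bed_file="regions.bed",
        bam_file="sample1.bam",
        gene_range_table="gene_ranges.bed",
        stb=stb,
        output_dir=out_dir,
        num_workers=4,
        ber=0.5,              # breadth / expected-breadth cutoff
        fug=2.0,              # FUG cutoff, applied at low coverage
        min_cov_use_fug=0.1,  # coverage at or below which FUG is also checked
    )
    # Per this diff, expect out/sample1_profile.parquet and out/sample1_genome_stats.parquet.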
@@ -147,13 +147,27 @@ async def _profile_chunk_task(
      stdout, stderr = await proc.communicate()
      if proc.returncode != 0:
          raise Exception(f"Command failed with error: {stderr.decode().strip()}")
+     cmd = ["samtools", "view", "-F", "132", "-L", str(bed_file.absolute()), str(bam_file.absolute()), "|", "zipstrain", "utilities", "process-read-locs", "--output-file", f"{bam_file.stem}_read_locs_{chunk_id}.parquet"]
+     proc = await asyncio.create_subprocess_shell(
+         " ".join(cmd),
+         stdout=asyncio.subprocess.PIPE,
+         stderr=asyncio.subprocess.PIPE,
+         cwd=output_dir
+     )
+     stdout, stderr = await proc.communicate()
+     if proc.returncode != 0:
+         raise Exception(f"Command failed with error: {stderr.decode().strip()}")
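One caveat with the shell pipeline above: create_subprocess_shell reports only the last command's exit status, so a samtools failure upstream can go unnoticed unless pipefail is set in the shell. A hedged sketch of an alternative that wires the pipe explicitly through an OS pipe and checks both stages; the surrounding names are assumed to match _profile_chunk_task:

    import asyncio
    import os

    async def run_read_loc_pipe(bed_file, bam_file, chunk_id, output_dir):
        read_fd, write_fd = os.pipe()
        samtools = await asyncio.create_subprocess_exec(
            "samtools", "view", "-F", "132",
            "-L", str(bed_file.absolute()), str(bam_file.absolute()),
            stdout=write_fd,
            stderr=asyncio.subprocess.PIPE,
        )
        os.close(write_fd)  # the child holds its own copy of the write end
        zipstrain = await asyncio.create_subprocess_exec(
            "zipstrain", "utilities", "process-read-locs",
            "--output-file", f"{bam_file.stem}_read_locs_{chunk_id}.parquet",
            stdin=read_fd,
            stderr=asyncio.subprocess.PIPE,
            cwd=output_dir,
        )
        os.close(read_fd)
        _, sam_err = await samtools.communicate()
        _, zs_err = await zipstrain.communicate()
        # Check both stages, not just the tail of the pipeline.
        if samtools.returncode != 0 or zipstrain.returncode != 0:
            raise Exception(f"Pipeline failed: {sam_err.decode().strip()} {zs_err.decode().strip()}")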

  async def profile_bam_in_chunks(
      bed_file:str,
      bam_file:str,
      gene_range_table:str,
+     stb:pl.LazyFrame,
      output_dir:str,
-     num_workers:int=4
+     num_workers:int=4,
+     ber:float=0.5,
+     fug:float=2.0,
+     min_cov_use_fug:float=0.1
  )->None:
      """
      Profile a BAM file in chunks using provided BED files.
@@ -189,17 +203,46 @@ async def profile_bam_in_chunks(
              chunk_id=chunk_id
          ))
      await asyncio.gather(*tasks)
-     pfs=[output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet" for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
-     mpileup_df = pl.concat([pl.scan_parquet(pf) for pf in pfs])
-     mpileup_df.sink_parquet(output_dir/f"{bam_file.stem}.parquet", compression='zstd')
+     pfs = [(output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet", output_dir/"tmp"/f"{bam_file.stem}_read_locs_{chunk_id}.parquet") for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
+ 
+     mpile_container: list[pl.LazyFrame] = []
+     read_loc_pfs: list[pl.LazyFrame] = []
+     for pf, read_loc_pf in pfs:
+         mpile_container.append(pl.scan_parquet(pf))
+         read_loc_pfs.append(pl.scan_parquet(read_loc_pf))
+ 
+     mpileup_df = pl.concat(mpile_container)
+     mpileup_df.sink_parquet(output_dir/f"{bam_file.stem}_profile.parquet", compression='zstd', engine='streaming')
+     read_loc_df = pl.concat(read_loc_pfs).rename(
+         {
+             "chrom": "scaffold",
+             "pos": "loc",
+         }
+     )
+ 
+     utils.get_genome_stats(
+         profile=mpileup_df,
+         read_loc_table=read_loc_df,
+         stb=stb,
+         bed=bed_lf.rename({"column_1": "scaffold", "column_2": "start", "column_3": "end"}),
+         ber=ber,
+         fug=fug,
+         min_cov_use_fug=min_cov_use_fug,
+     ).sink_parquet(output_dir/f"{bam_file.stem}_genome_stats.parquet", compression='zstd', engine='streaming')
+ 
      os.system(f"rm -r {output_dir}/tmp")

  def profile_bam(
      bed_file:str,
      bam_file:str,
      gene_range_table:str,
+     stb:pl.LazyFrame,
      output_dir:str,
-     num_workers:int=4
+     num_workers:int=4,
+     ber:float=0.5,
+     fug:float=2.0,
+     min_cov_use_fug:float=0.1
  )->None:
      """
      Profile a BAM file in chunks using provided BED files.
@@ -208,6 +251,7 @@ def profile_bam(
          bed_file (list[pathlib.Path]): A bed file describing all regions to be profiled.
          bam_file (pathlib.Path): Path to the BAM file.
          gene_range_table (pathlib.Path): Path to the gene range table.
+         stb (pl.LazyFrame): Scaffold-to-bin mapping table.
          output_dir (pathlib.Path): Directory to save output files.
          num_workers (int): Number of concurrent workers to use.
      """
@@ -215,7 +259,11 @@ def profile_bam(
          bed_file=bed_file,
          bam_file=bam_file,
          gene_range_table=gene_range_table,
+         stb=stb,
          output_dir=output_dir,
-         num_workers=num_workers
+         num_workers=num_workers,
+         ber=ber,
+         fug=fug,
+         min_cov_use_fug=min_cov_use_fug
      ))

@@ -127,6 +127,9 @@ class Status(StrEnum):
      SUCCESS = "success"
      PENDING = "pending"

+ class Messages(StrEnum):
+     """Enumeration of common messages used in task and batch management."""
+     CANCELLED_BY_USER = "Task was cancelled by a signal from the user."

  class Input(ABC):
      """Abstract base class for task inputs. DO NOT INSTANTIATE DIRECTLY.
@@ -275,7 +278,6 @@ class IntOutput(Output):
              raise ValueError(f"Output value for task {self.task.id} is not an integer.")
          else:
              return False
-         return False


  class Engine(ABC):
@@ -391,6 +393,7 @@ class Task(ABC):
          """Asynchronously reads the task status from the .status file in the task directory."""
          status_path = self.task_dir / ".status"
          # read the status file if it exists
+ 
          if status_path.exists():
              raw = await read_file(status_path, self.file_semaphore)
              self._status = raw.strip()
@@ -406,12 +409,11 @@ class Task(ABC):
          except Exception:
              all_ready = False

-         if all_ready:
+         if all_ready or self._batch_obj._cleaned_up:
              self._status = Status.SUCCESS.value
-             await write_file(status_path, Status.SUCCESS.value, self.file_semaphore)
+ 
          else:
              self._status = Status.FAILED.value
-             await write_file(status_path, Status.FAILED.value, self.file_semaphore)
              raise ValueError(f"Task {self.id} reported done but outputs are not ready or invalid. {self.expected_outputs['output-file'].expected_file.absolute()}")

          return self._status
@@ -521,13 +523,13 @@ class ProfileTaskGenerator(TaskGenerator):
              "bed-file": FileInput(self.profile_bed_file),
              "gene-range-table": FileInput(self.gene_range_file),
              "genome-length-file": FileInput(self.genome_length_file),
-             "num-threads": IntInput(self.num_procs),
+             "num-workers": IntInput(self.num_procs),
              "breadth-min-cov": IntInput(self.breadth_min_cov),
          }
          expected_outputs = {
              "profile": FileOutput(row["sample_name"]+".parquet"),
-             "breadth": FileOutput(row["sample_name"]+"_breadth.parquet"),
              "scaffold": FileOutput(row["sample_name"]+".parquet.scaffolds"),
+             "genome-stats": FileOutput(row["sample_name"]+"_genome_stats.parquet"),
          }
          task = ProfileBamTask(id=row["sample_name"], inputs=inputs, expected_outputs=expected_outputs, engine=self.engine)
          tasks.append(task)
@@ -637,9 +639,8 @@ class Batch(ABC):
              task.map_io()

          self._runner_obj:Runner = None
- 
+         self._cleaned_up = False

- 
      def _get_initial_status(self) -> str:
          """Returns the initial status of the batch based on the presence of the batch directory."""
          if not self.batch_dir.exists():
@@ -663,6 +664,7 @@ class Batch(ABC):

      def cleanup(self) -> None:
          """The base class defines if any cleanup is needed after batch success. By default, it does nothing."""
+         self._cleaned_up = True
          return None

      @abstractmethod
@@ -718,33 +720,30 @@ class LocalBatch(Batch):

      def __init__(self, tasks, id, run_dir, expected_outputs) -> None:
          super().__init__(tasks, id, run_dir, expected_outputs)
-         self._script = self.TEMPLATE_CMD + "\nset -o pipefail\n"
+         self._script = self.TEMPLATE_CMD + "\nset -euo pipefail\n"
          self._proc: asyncio.subprocess.Process | None = None


      async def run(self) -> None:
          """This method runs all tasks in the batch locally by creating a shell script and executing it."""
-         if self.status != Status.SUCCESS and self.status != Status.FAILED.value:
+         if self.status != Status.SUCCESS:
              self.batch_dir.mkdir(parents=True, exist_ok=True)
+ 
              self._status = Status.RUNNING.value
- 
- 
              await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
- 
+ 
+             script_path = self.batch_dir / f"{self.id}.sh"  # Path to the shell script for the batch
+             script = self._script  # Initialize the script content
+ 
              for task in self.tasks:
-                 if task.status == Status.NOT_STARTED.value:
+                 if task.status != Status.SUCCESS.value:
                      task.task_dir.mkdir(parents=True, exist_ok=True)  # Create task directory
                      await write_file(task.task_dir / ".status", Status.NOT_STARTED.value, self.file_semaphore)
- 
-             script_path = self.batch_dir / f"{self.id}.sh"  # Path to the shell script for the batch
- 
-             script = self._script
-             for task in self.tasks:
-                 if task.status == Status.NOT_STARTED.value or task.status == Status.FAILED.value:
                      script += f"\n{task.pre_run}\n{task.command}\n{task.post_run}\n"

+ 
+ 
              await write_file(script_path, script, self.file_semaphore)
- 
              await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)

              self._proc = await asyncio.create_subprocess_exec(
@@ -753,28 +752,37 @@ class LocalBatch(Batch):
                  stderr=asyncio.subprocess.PIPE,
                  cwd=self.batch_dir,
              )
+ 
              try:
                  out_bytes, err_bytes = await self._proc.communicate()
+ 
              except asyncio.CancelledError:
                  if self._proc and self._proc.returncode is None:
                      self._proc.terminate()
-                 raise
+                 await write_file(self.batch_dir / f"{self.id}.err", Messages.CANCELLED_BY_USER.value, self.file_semaphore)
+ 
+                 raise Exception(Messages.CANCELLED_BY_USER.value)
+ 

              await write_file(self.batch_dir / f"{self.id}.out", out_bytes.decode(), self.file_semaphore)
              await write_file(self.batch_dir / f"{self.id}.err", err_bytes.decode(), self.file_semaphore)

+             if self._proc.returncode != 0:
+                 error = err_bytes.decode()
+                 raise RuntimeError(f"Batch script failed with error:\n{error}")
+ 
              if self._proc.returncode == 0 and self.outputs_ready():
                  self.cleanup()
                  self._status = Status.SUCCESS.value
                  await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
+ 
              else:
                  self._status = Status.FAILED.value
                  await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
- 
-         elif self.status == Status.SUCCESS.value and self.outputs_ready():
-             self._status = Status.SUCCESS.value
+ 
          else:
-             self._status = Status.FAILED.value
+             self._status = Status.SUCCESS.value
+ 

      def _parse_job_id(self, sbatch_output):
          return super()._parse_job_id(sbatch_output)
@@ -810,7 +818,7 @@ class SlurmBatch(Batch):
          super().__init__(tasks, id, run_dir, expected_outputs)
          self._check_slurm_works()
          self.slurm_config = slurm_config
-         self._script = self.TEMPLATE_CMD + self.slurm_config.to_slurm_args() + "\nset -o pipefail\n"
+         self._script = self.TEMPLATE_CMD + self.slurm_config.to_slurm_args() + "\nset -euo pipefail\n"
          self._job_id = None

      def _check_slurm_works(self) -> None:
@@ -1450,7 +1458,7 @@ class ProfileBamTask(Task):

      - gene-range-table: A BED file specifying the gene ranges for the sample.

-     - num-threads: The number of threads to use for processing.
+     - num-workers: The number of concurrent workers to use for processing.

      - genome-length-file: A file containing the lengths of the genomes in the reference fasta.

@@ -1470,15 +1478,12 @@ class ProfileBamTask(Task):
      zipstrain profile profile-single --bam-file input.bam \
          --bed-file bed_file.bed \
          --gene-range-table gene-range-table.bed \
+         --stb-file <stb-file> \
          --num-workers <num-workers> \
          --output-dir .
-     mv input.bam.parquet <sample-name>.parquet
+     mv input_profile.parquet <sample-name>.parquet
+     mv input_genome_stats.parquet <sample-name>_genome_stats.parquet
      samtools idxstats <bam-file> | awk '$3 > 0 {print $1}' > <sample-name>.parquet.scaffolds
-     zipstrain utilities genome_breadth_matrix --profile <sample-name>.parquet \
-         --genome-length <genome-length-file> \
-         --stb <stb-file> \
-         --min-cov <breadth-min-cov> \
-         --output-file <sample-name>_breadth.parquet
      """

  class FastCompareTask(Task):
@@ -1552,6 +1557,7 @@ class FastCompareLocalBatch(LocalBatch):
          for task in tasks_to_remove:
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+         self._cleaned_up = True

  class FastCompareSlurmBatch(SlurmBatch):
      """A SlurmBatch that runs FastCompareTask tasks on a Slurm cluster. Maybe removed in future"""
@@ -1560,6 +1566,8 @@ class FastCompareSlurmBatch(SlurmBatch):
          for task in tasks_to_remove:
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+ 
+         self._cleaned_up = True

  class PrepareCompareGenomeRunOutputsLocalBatch(LocalBatch):
      pass
@@ -1925,8 +1933,11 @@ class FastGeneCompareLocalBatch(LocalBatch):
      def cleanup(self) -> None:
          tasks_to_remove = [task for task in self.tasks if isinstance(task, FastGeneCompareTask)]
          for task in tasks_to_remove:
+             task._status = Status.SUCCESS
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+         self._cleaned_up = True
+ 

  class FastGeneCompareSlurmBatch(SlurmBatch):
      """A SlurmBatch that runs FastGeneCompareTask tasks on a Slurm cluster."""
@@ -1935,6 +1946,7 @@ class FastGeneCompareSlurmBatch(SlurmBatch):
          for task in tasks_to_remove:
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+         self._cleaned_up = True

  class PrepareGeneCompareRunOutputsLocalBatch(LocalBatch):
      pass
@@ -140,7 +140,7 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output

          if writer is None:
              # Open writer for the first time
-             writer = pq.ParquetWriter(output_file, schema, compression='snappy')
+             writer = pq.ParquetWriter(output_file, schema, compression='zstd')
          writer.write_table(pa.Table.from_batches([batch]))

          # Clear buffers
@@ -180,6 +180,51 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output
      if writer:
          writer.close()

+ def process_read_location(output_file: str, batch_size: int = 10000) -> None:
+     """
+     Takes the output of `samtools view -F 132` on stdin and writes the read locations to a Parquet file.
+     """
+     schema = pa.schema([
+         ('chrom', pa.string()),
+         ('pos', pa.int32()),
+     ])
+     writer = None
+     chroms = []
+     positions = []
+     def flush_batch():
+         nonlocal writer
+         if not chroms:
+             return
+         batch = pa.RecordBatch.from_arrays([
+             pa.array(chroms, type=pa.string()),
+             pa.array(positions, type=pa.int32()),
+         ], schema=schema)
+ 
+         if writer is None:
+             # Open writer for the first time
+             writer = pq.ParquetWriter(output_file, schema, compression='zstd')
+         writer.write_table(pa.Table.from_batches([batch]))
+ 
+         # Clear buffers
+         chroms.clear()
+         positions.clear()
+     for line in sys.stdin:
+         if not line.strip():
+             continue
+         fields = line.strip().split('\t')
+         if len(fields) < 4:
+             continue
+         chrom, pos = fields[2], fields[3]
+         chroms.append(chrom)
+         positions.append(int(pos))
+         if len(chroms) >= batch_size:
+             flush_batch()
+     # Flush remaining data
+     flush_batch()
+     if writer:
+         writer.close()
+ 
+ 
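Because the function only reads raw SAM lines on stdin and keeps fields 3 (RNAME) and 4 (POS), it can be exercised without samtools by substituting sys.stdin. A minimal sketch; the zipstrain.utils import path is an assumption based on the ut alias used in the CLI:

    import io
    import sys
    import polars as pl
    from zipstrain import utils as ut  # assumed module path

    # Two fake SAM records: QNAME, FLAG, RNAME, POS, ... (only fields 3 and 4 are used).
    fake_sam = (
        "read1\t0\tcontig_1\t101\t60\t50M\t*\t0\t0\tACGT\tIIII\n"
        "read2\t0\tcontig_1\t180\t60\t50M\t*\t0\t0\tACGT\tIIII\n"
    )
    old_stdin = sys.stdin
    sys.stdin = io.StringIO(fake_sam)
    try:
        ut.process_read_location(output_file="read_locs.parquet")
    finally:
        sys.stdin = old_stdin

    # The resulting table has columns 'chrom' and 'pos'.
    print(pl.read_parquet("read_locs.parquet"))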
  def extract_genome_length(stb: pl.LazyFrame, bed_table: pl.LazyFrame) -> pl.LazyFrame:
      """
      Extract the genome length information from the scaffold-to-genome mapping table.
@@ -350,102 +395,106 @@ def split_lf_to_chunks(lf:pl.LazyFrame,num_chunks:int)->list[pl.LazyFrame]:
      return chunks


- def estimate_genome_presence(
+ def get_genome_gaps(
+     read_loc_table: pl.LazyFrame,
+     stb: pl.LazyFrame,
+     genome_length: pl.LazyFrame,
+ ) -> pl.LazyFrame:
+     """
+     Estimate, per genome, the FUG statistic from the gaps between consecutive
+     read start positions.
+     """
+     read_loc_table = read_loc_table.sort(["scaffold", "loc"])
+     read_loc_table = read_loc_table.with_columns(
+         (pl.col("loc") - pl.col("loc").shift(1).over("scaffold")).alias("gap_length")
+     ).join(
+         stb,
+         on="scaffold",
+         how="left"
+     )
+     delta = read_loc_table.group_by("genome").agg(
+         rn=pl.len()
+     ).join(
+         genome_length,
+         on="genome",
+         how="left"
+     ).with_columns(
+         delta=(pl.col("genome_length")/pl.col("rn")).round()
+     ).select(
+         pl.col("genome"),
+         pl.col("delta"),
+         pl.col("rn")
+     )
+     read_loc_table = read_loc_table.join(
+         delta,
+         on="genome",
+         how="left"
+     )
+     read_loc_table = read_loc_table.filter(
+         pl.col("gap_length") > pl.col("delta")
+     ).group_by(["genome", "gap_length"]).agg(
+         pd=(pl.len()/(pl.col("rn").first()-1)),
+         delta=pl.col("delta").first()
+     ).with_columns(
+         pd=pl.col("pd") * (pl.col("gap_length")-pl.col("delta"))
+     ).group_by("genome").agg(
+         fug=(pl.col("delta").first()-pl.col("pd").sum())/pl.col("delta").first()
+     )
+     return read_loc_table.select(
+         pl.col("genome"),
+         pl.col("fug")
+     )
+ 
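To make the gap arithmetic concrete: with rn read starts on a genome of length L, the expected gap is delta = round(L / rn); each distinct observed gap longer than delta contributes (count / (rn - 1)) * (gap_length - delta), and fug is the fraction of delta left after subtracting that excess. A plain-Python rendering of the same computation with toy numbers (not from any real sample):

    from collections import Counter

    # Toy example: one genome, hypothetical read start positions on one scaffold.
    genome_length = 1000
    read_starts = sorted([10, 120, 230, 340, 460, 580, 700, 820, 940, 995])

    rn = len(read_starts)
    delta = round(genome_length / rn)  # expected gap under uniform read placement

    # Gaps between consecutive read starts (shift(1).over("scaffold") in the Polars version).
    gaps = [b - a for a, b in zip(read_starts, read_starts[1:])]

    # Only gaps longer than delta contribute excess gap mass.
    pd_sum = sum(
        (count / (rn - 1)) * (gap - delta)
        for gap, count in Counter(g for g in gaps if g > delta).items()
    )
    fug = (delta - pd_sum) / delta
    print(delta, fug)  # delta = 100; most gaps are ~110-120 here, so fug stays close to 1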
+ def get_genome_stats(
      profile:pl.LazyFrame,
      bed: pl.LazyFrame,
      stb: pl.LazyFrame,
+     read_loc_table: pl.LazyFrame,
      ber:float=0.5,
-     cv_threshold:float=2.5,
-     min_cov_constant_poisson: int = 0.5,
+     fug:float=2.0,
+     min_cov_use_fug:float=0.1
  )->pl.LazyFrame:
-     """
-     This function estimates the presence of genomes in a sample based on coverage information.
-     as long as the coverage is above a certain threshold. BER is used to decide the threshold.
-     However, if the coverage is below the threshold, the coefficient of variation (CV) is used instead as
-     a more reliable metric for low-coverage scenarios.
- 
-     Args:
-         profile (pl.LazyFrame): The profile LazyFrame containing coverage information.
-         bed (pl.LazyFrame): The BED table containing genomic regions.
-         stb (pl.LazyFrame): The scaffold-to-bin mapping LazyFrame.
-         ber (float): Breadth over expected breadth ratio threshold for genome presence.
-         cv_threshold (float): Coefficient of variation threshold for genome presence.
-         min_cov_constant_poisson (int): Minimum coverage threshold to use BER for presence estimation.
- 
-     Returns:
-         pl.LazyFrame: A LazyFrame containing genome presence information.
-     """
+     """
+     Compute per-genome coverage, breadth, BER, and FUG, and call presence.
+ 
+     A genome is called present when BER exceeds `ber`; at coverage at or below
+     `min_cov_use_fug`, the normalized FUG statistic must also fall below `fug`.
+     """
-     profile=profile.with_columns(
-         (pl.col("A")+pl.col("T")+pl.col("C")+pl.col("G")).alias("coverage")
-     )
-     starts_df=bed.select(
-         pl.col("scaffold").cast(profile.collect_schema()["chrom"]).alias("chrom"),
-         pl.col("start").cast(profile.collect_schema()["pos"]).alias("pos"),
-         pl.lit("NA").cast(profile.collect_schema()["gene"]).alias("gene"),
-         pl.lit(0).cast(profile.collect_schema()["A"]).alias("A"),
-         pl.lit(0).cast(profile.collect_schema()["T"]).alias("T"),
-         pl.lit(0).cast(profile.collect_schema()["C"]).alias("C"),
-         pl.lit(0).cast(profile.collect_schema()["G"]).alias("G"),
-         pl.lit(0).cast(profile.collect_schema()["coverage"]).alias("coverage")
-     )
-     ends_df=bed.select(
-         pl.col("scaffold").cast(profile.collect_schema()["chrom"]).alias("chrom"),
-         (pl.col("end")-1).cast(profile.collect_schema()["pos"]).alias("pos"),
-         pl.lit("NA").cast(profile.collect_schema()["gene"]).alias("gene"),
-         pl.lit(0).cast(profile.collect_schema()["A"]).alias("A"),
-         pl.lit(0).cast(profile.collect_schema()["T"]).alias("T"),
-         pl.lit(0).cast(profile.collect_schema()["C"]).alias("C"),
-         pl.lit(0).cast(profile.collect_schema()["G"]).alias("G"),
-         pl.lit(0).cast(profile.collect_schema()["coverage"]).alias("coverage")
-     )

-     profile=pl.concat([profile,starts_df,ends_df]).unique(subset=["chrom","pos"],keep="first").sort(["chrom","pos"])
-     genome_lengths=bed.join(
+     genome_lengths = extract_genome_length(stb, bed)
+     genome_gap_stats = get_genome_gaps(read_loc_table, stb, genome_lengths)
+     profile = profile.join(
          stb,
-         on="scaffold",
+         left_on="chrom",
+         right_on="scaffold",
          how="left"
-     ).group_by("genome").agg(
-         genome_length=(pl.col("end") - pl.col("start")).sum()
      ).select(
+         pl.col("chrom"),
          pl.col("genome"),
-         pl.col("genome_length")
+         (pl.col("A")+pl.col("C")+pl.col("G")+pl.col("T")).alias("coverage")
      )
-     profile=profile.with_columns(
-         pl.col("pos").shift(1).fill_null(0).over("chrom").alias("prev_pos"),
-     ).with_columns(
-         (pl.col("pos") - pl.col("prev_pos")).clip(lower_bound=1).alias("gap_size")
+     profile = profile.group_by("genome").agg(
+         total_covered_sites=pl.len(),
+         coverage=pl.col("coverage").sum()
      ).join(
-         stb,
-         left_on="chrom",
-         right_on="scaffold",
+         genome_lengths,
+         on="genome",
          how="left"
-     ).group_by("genome").agg(
-         cv=pl.col("gap_size").filter(pl.col("gap_size") > 1).std()/pl.col("gap_size").filter(pl.col("gap_size") > 1).mean(),
-         total_coverage=pl.col("coverage").sum(),
-         covered_positions=(pl.col("coverage")>0).sum()
      ).join(
-         genome_lengths,
+         genome_gap_stats,
          on="genome",
          how="left"
      ).with_columns(
-         (pl.col("covered_positions")/pl.col("genome_length")).alias("breadth"),
-         (pl.col("total_coverage")/pl.col("genome_length")).alias("coverage"),
-     ).select(
-         pl.col("genome"),
-         pl.col("cv"),
-         pl.col("breadth"),
-         pl.col("coverage"),
+         coverage=(pl.col("coverage")/pl.col("genome_length")),
+         breadth=(pl.col("total_covered_sites")/pl.col("genome_length")),
      ).with_columns(
-         (pl.col("breadth")/(1-(-0.883*pl.col("coverage")).exp())).alias("ber"),
+         ber=pl.col("breadth")/(1-(-0.883*pl.col("coverage")).exp()),
+         fug=pl.col("fug")
      ).with_columns(
-         pl.when(
-             pl.col("coverage") >= min_cov_constant_poisson
-         ).then(
-             pl.col("ber") >= ber
+         pl.when(pl.col("coverage") > min_cov_use_fug)
+         .then(
+             pl.col("ber") > ber
          ).otherwise(
-             (pl.col("cv") <= cv_threshold) & (~pl.col("ber").is_nan())
-         ).alias("is_present")
+             (pl.col("fug")/0.632 < fug) &
+             (pl.col("ber") > ber)
+         ).fill_null(False).alias("is_present")
+     )
+ 
+     return profile.select(
+         pl.col("genome"),
+         pl.col("coverage"),
+         pl.col("breadth"),
+         pl.col("ber"),
+         pl.col("fug"),
+         pl.col("is_present")
      )

-     return profile
- 
+ 
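The presence call combines two signals: expected breadth follows the exponential model 1 - exp(-0.883 * coverage), BER is observed breadth over that expectation, and at low coverage the FUG statistic is additionally required to stay below the threshold after dividing by 0.632 (which matches 1 - 1/e, presumably its value under uniform read placement). A small eager sketch of the rule with made-up numbers:

    import math

    def is_present(coverage: float, breadth: float, fug_value: float,
                   ber_cutoff: float = 0.5, fug_cutoff: float = 2.0,
                   min_cov_use_fug: float = 0.1) -> bool:
        # Expected breadth under the exponential coverage model used in get_genome_stats.
        expected_breadth = 1 - math.exp(-0.883 * coverage)
        ber = breadth / expected_breadth
        if coverage > min_cov_use_fug:
            return ber > ber_cutoff
        # Low coverage: also require the normalized FUG statistic to stay small.
        return (fug_value / 0.632 < fug_cutoff) and (ber > ber_cutoff)

    # Made-up numbers: 2x coverage with 80% breadth passes on BER alone...
    print(is_present(coverage=2.0, breadth=0.80, fug_value=0.9))   # True
    # ...while at 0.05x coverage both the FUG and BER checks must agree.
    print(is_present(coverage=0.05, breadth=0.03, fug_value=0.2))  # True: both checks pass here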
File without changes