zipstrain 0.2.8__tar.gz → 0.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: zipstrain
- Version: 0.2.8
+ Version: 0.2.16
  Summary:
  Author: ParsaGhadermazi
  Author-email: 54489047+ParsaGhadermazi@users.noreply.github.com
@@ -1,6 +1,6 @@
  [project]
  name = "zipstrain"
- version = "0.2.8"
+ version = "0.2.16"
  description = ""
  authors = [
      {name = "ParsaGhadermazi",email = "54489047+ParsaGhadermazi@users.noreply.github.com"}
@@ -324,43 +324,91 @@ def to_complete_table(genome_comparison_object, output_file):
  @click.option('--profile-file', '-p', required=True, help="Path to the profile Parquet file.")
  @click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
  @click.option('--bed-file', '-b', required=True, help="Path to the BED file.")
- @click.option('--min-cov', '-c', default=0.1, help="Minimum genome-wide coverage to use homogeneous Poisson point process.")
- @click.option('--ber', '-e', default=0.5, help="Minimum breadth to expected breadth ratio to consider a genome as present.")
- @click.option('--cv-threshold', '-v', default=1.0, help="Maximum coefficient of variation to consider genome as present when coverage is smaller than min-cov.")
+ @click.option('--read-loc-file', '-r', required=True, help="Path to the read location table.")
+ @click.option('--min-cov-fug', '-c', default=0.1, help="Coverage threshold at or below which the FUG statistic is used.")
+ @click.option('--fug-threshold', '-f', default=2.0, help="Maximum normalized FUG value to consider a genome as present.")
+ @click.option('--ber', '-e', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
  @click.option('--output-file', '-o', required=True, help="Path to save the output Parquet file.")
- def presence_profile(profile_file, stb_file, bed_file, min_cov, ber, cv_threshold, output_file):
+ def presence_profile(profile_file, stb_file, bed_file, read_loc_file, min_cov_fug, fug_threshold, ber, output_file):
      """
-     Generate a presence profile from the given files.
+     Generate a presence profile for genomes based on the given profile and read location data.

      Args:
          profile_file (str): Path to the profile Parquet file.
          stb_file (str): Path to the scaffold-to-genome mapping file.
          bed_file (str): Path to the BED file.
+         read_loc_file (str): Path to the read location table.
+         min_cov_fug (float): Coverage threshold at or below which the FUG statistic is used.
+         fug_threshold (float): Maximum normalized FUG value to consider a genome as present.
+         ber (float): Minimum ratio of breadth over expected breadth to consider presence.
+         output_file (str): Path to save the output Parquet file.
      """
-     profile=pl.scan_parquet(profile_file)
+     profile = pl.scan_parquet(profile_file)
      stb = pl.scan_csv(stb_file, separator="\t", has_header=False).with_columns(
          pl.col("column_1").alias("scaffold"),
          pl.col("column_2").alias("genome")
      ).select(["scaffold", "genome"])
      bed = pl.scan_csv(bed_file, separator="\t", has_header=False).with_columns(
          pl.col("column_1").alias("scaffold"),
-         pl.col("column_2").alias("start"),
-         pl.col("column_3").alias("end")
+         pl.col("column_2").cast(pl.Int64).alias("start"),
+         pl.col("column_3").cast(pl.Int64).alias("end")
      ).select(["scaffold", "start", "end"])
-     ut.estimate_genome_presence(
+     read_loc_table = pl.scan_parquet(read_loc_file).rename({
+         "chrom": "scaffold",
+         "pos": "loc"
+     })
+     presence_df = ut.get_genome_stats(
          profile=profile,
          stb=stb,
          bed=bed,
-         cv_threshold=cv_threshold,
-         ber=ber,
-         min_cov_constant_poisson=min_cov
-     ).sink_parquet(output_file, compression='zstd',engine="streaming")
+         read_loc_table=read_loc_table,
+         min_cov_use_fug=min_cov_fug,
+         fug=fug_threshold,
+         ber=ber
+     )
+     presence_df.sink_parquet(output_file, compression='zstd')
+ 
+ @utilities.command("process-read-locs")
+ @click.option("--output-file", "-o", required=True, help="Path to save the processed read locations Parquet file.")
+ def process_read_locs(output_file):
+     """
+     Process read locations streamed on stdin (samtools view output) and save them to a Parquet file.
+ 
+     Args:
+         output_file (str): Path to save the output Parquet file.
+     """
+     ut.process_read_location(output_file=pathlib.Path(output_file))
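Note on usage: process-read-locs reads alignment records from stdin, so it is meant to sit downstream of samtools view -F 132 (flag 4 = unmapped, 128 = second in pair), exactly as the chunked profiler later in this diff invokes it. A minimal sketch of that wiring in Python; the file names here are hypothetical:

    import subprocess

    # Hypothetical inputs: a coordinate-sorted, indexed BAM plus a BED of target regions.
    bam_file = "sample1.bam"
    bed_file = "regions.bed"

    # -F 132 drops unmapped reads and second-in-pair mates, matching _profile_chunk_task.
    samtools = subprocess.Popen(
        ["samtools", "view", "-F", "132", "-L", bed_file, bam_file],
        stdout=subprocess.PIPE,
    )
    zipstrain = subprocess.run(
        ["zipstrain", "utilities", "process-read-locs",
         "--output-file", "sample1_read_locs.parquet"],
        stdin=samtools.stdout,
    )
    samtools.stdout.close()
    if samtools.wait() != 0 or zipstrain.returncode != 0:
        raise RuntimeError("read-location extraction failed")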

  @cli.group()
  def gene_tools():
      """Holds anything related to gene analysis."""
      pass

+ @utilities.command("generate_stb")
+ @click.option('--genomes-dir-file', '-g', required=True, help="Path to the directory containing the genome fasta files.")
+ @click.option('--output-file', '-o', required=True, help="Path to save the output scaffold-to-genome mapping file.")
+ @click.option('--extension', '-e', default=".fasta", help="File extension of the genome fasta files.")
+ def generate_stb(genomes_dir_file, output_file, extension):
+     """
+     Generate a scaffold-to-genome mapping file from the genome fasta files in the given directory.
+ 
+     Args:
+         genomes_dir_file (str): Path to the directory containing the genome fasta files.
+         output_file (str): Path to save the output scaffold-to-genome mapping file.
+         extension (str): File extension of the genome fasta files.
+     """
+     with open(output_file, 'w') as out_f:
+         for genome in pathlib.Path(genomes_dir_file).glob(f"*{extension}"):
+             genome_name = genome.stem
+             with open(genome, 'r') as gf:
+                 for line in gf:
+                     if line.startswith('>'):
+                         scaffold_name = line[1:].strip().split()[0]
+                         out_f.write(f"{scaffold_name}\t{genome_name}\n")
+ 
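For reference, the file this command writes is the same two-column, headerless TSV that presence-profile and profile-single load with pl.scan_csv(..., separator='\t', has_header=False). A small sketch with made-up scaffold and genome names:

    import polars as pl

    # Hypothetical stb content: scaffold<TAB>genome, no header row.
    stb_text = "contig_1\tgenomeA\ncontig_2\tgenomeA\ncontig_9\tgenomeB\n"
    with open("example.stb", "w") as fh:
        fh.write(stb_text)

    # Load it exactly the way the CLI commands in this diff do.
    stb = pl.scan_csv("example.stb", separator="\t", has_header=False).with_columns(
        pl.col("column_1").alias("scaffold"),
        pl.col("column_2").alias("genome"),
    ).select(["scaffold", "genome"])
    print(stb.collect())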

  @gene_tools.command("gene-range-table")
  @click.option('--gene-file', '-g', required=True, help="location of gene file. Prodigal's nucleotide fasta output")
@@ -603,22 +651,34 @@ def prepare_profiling(reference_fasta, gene_fasta, stb_file, output_dir):
  @profile.command("profile-single")
  @click.option('--bed-file', '-b', required=True, help="Path to the BED file describing regions to be profiled.")
  @click.option('--bam-file', '-a', required=True, help="Path to the BAM file to be profiled.")
+ @click.option('--stb-file', '-s', required=True, help="Path to the scaffold-to-genome mapping file.")
  @click.option('--gene-range-table', '-g', required=True, help="Path to the gene range table.")
  @click.option('--num-workers', '-n', default=1, help="Number of workers to use for profiling.")
  @click.option('--output-dir', '-o', required=True, help="Directory to save the profiling output.")
- def profile_single(bed_file, bam_file, gene_range_table, num_workers, output_dir):
+ @click.option('--ber', '-r', default=0.5, help="Minimum ratio of breadth over expected breadth to consider presence.")
+ @click.option('--fug', '-f', default=2.0, help="Fraction of expected gaps (FUG) threshold.")
+ @click.option('--min-cov-use-fug', '-m', default=0.1, help="Coverage threshold at or below which FUG is used.")
+ def profile_single(bed_file, bam_file, stb_file, gene_range_table, num_workers, output_dir, ber, fug, min_cov_use_fug):
      """
      Profile a single BAM file using the provided BED file and gene range table.

      """
      output_dir=pathlib.Path(output_dir)
      output_dir.mkdir(parents=True, exist_ok=True)
+     stb = pl.scan_csv(stb_file, separator='\t', has_header=False).with_columns(
+         pl.col("column_1").alias("scaffold"),
+         pl.col("column_2").alias("genome")
+     )
      pf.profile_bam(
          bed_file=bed_file,
          bam_file=bam_file,
          gene_range_table=gene_range_table,
+         stb=stb,
          output_dir=output_dir,
-         num_workers=num_workers
+         num_workers=num_workers,
+         ber=ber,
+         fug=fug,
+         min_cov_use_fug=min_cov_use_fug
      )

  @cli.group()
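With the new signature, profile_bam takes the scaffold-to-genome table as a Polars LazyFrame alongside the presence thresholds. A sketch of driving it from Python instead of the CLI; the import path for the profiling module is an assumption based on the pf alias used above, and all file names are hypothetical:

    import pathlib
    import polars as pl
    from zipstrain import profile as pf  # assumed import path for the profiling module

    stb = pl.scan_csv("genomes.stb", separator="\t", has_header=False).with_columns(
        pl.col("column_1").alias("scaffold"),
        pl.col("column_2").alias("genome"),
    )
    out_dir = pathlib.Path("out")
    out_dir.mkdir(parents=True, exist_ok=True)
    pf.profile_bam(
        bed_file="regions.bed",
        bam_file="sample1.bam",
        gene_range_table="gene_ranges.bed",
        stb=stb,
        output_dir=out_dir,
        num_workers=4,
        ber=0.5,              # breadth / expected-breadth cutoff
        fug=2.0,              # FUG cutoff, applied at low coverage
        min_cov_use_fug=0.1,  # coverage at or below which FUG is also checked
    )
    # Per this diff, expect out/sample1_profile.parquet and out/sample1_genome_stats.parquet.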
@@ -147,13 +147,27 @@ async def _profile_chunk_task(
      stdout, stderr = await proc.communicate()
      if proc.returncode != 0:
          raise Exception(f"Command failed with error: {stderr.decode().strip()}")
+     cmd = ["samtools", "view", "-F", "132", "-L", str(bed_file.absolute()), str(bam_file.absolute()), "|", "zipstrain", "utilities", "process-read-locs", "--output-file", f"{bam_file.stem}_read_locs_{chunk_id}.parquet"]
+     proc = await asyncio.create_subprocess_shell(
+         " ".join(cmd),
+         stdout=asyncio.subprocess.PIPE,
+         stderr=asyncio.subprocess.PIPE,
+         cwd=output_dir
+     )
+     stdout, stderr = await proc.communicate()
+     if proc.returncode != 0:
+         raise Exception(f"Command failed with error: {stderr.decode().strip()}")
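One caveat with the shell pipeline above: create_subprocess_shell reports only the last command's exit status, so a samtools failure upstream can go unnoticed unless pipefail is set in the shell. A hedged sketch of an alternative that wires the pipe explicitly through an OS pipe and checks both stages; the surrounding names are assumed to match _profile_chunk_task:

    import asyncio
    import os

    async def run_read_loc_pipe(bed_file, bam_file, chunk_id, output_dir):
        read_fd, write_fd = os.pipe()
        samtools = await asyncio.create_subprocess_exec(
            "samtools", "view", "-F", "132",
            "-L", str(bed_file.absolute()), str(bam_file.absolute()),
            stdout=write_fd,
            stderr=asyncio.subprocess.PIPE,
        )
        os.close(write_fd)  # the child holds its own copy of the write end
        zipstrain = await asyncio.create_subprocess_exec(
            "zipstrain", "utilities", "process-read-locs",
            "--output-file", f"{bam_file.stem}_read_locs_{chunk_id}.parquet",
            stdin=read_fd,
            stderr=asyncio.subprocess.PIPE,
            cwd=output_dir,
        )
        os.close(read_fd)
        _, sam_err = await samtools.communicate()
        _, zs_err = await zipstrain.communicate()
        # Check both stages, not just the tail of the pipeline.
        if samtools.returncode != 0 or zipstrain.returncode != 0:
            raise Exception(f"Pipeline failed: {sam_err.decode().strip()} {zs_err.decode().strip()}")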

  async def profile_bam_in_chunks(
      bed_file:str,
      bam_file:str,
      gene_range_table:str,
+     stb:pl.LazyFrame,
      output_dir:str,
-     num_workers:int=4
+     num_workers:int=4,
+     ber:float=0.5,
+     fug:float=2.0,
+     min_cov_use_fug:float=0.1
  )->None:
      """
      Profile a BAM file in chunks using provided BED files.
@@ -189,17 +203,46 @@ async def profile_bam_in_chunks(
              chunk_id=chunk_id
          ))
      await asyncio.gather(*tasks)
-     pfs=[output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet" for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
-     mpileup_df = pl.concat([pl.scan_parquet(pf) for pf in pfs])
-     mpileup_df.sink_parquet(output_dir/f"{bam_file.stem}.parquet", compression='zstd')
+     pfs = [(output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet", output_dir/"tmp"/f"{bam_file.stem}_read_locs_{chunk_id}.parquet") for chunk_id in range(len(bed_chunk_files)) if (output_dir/"tmp"/f"{bam_file.stem}_{chunk_id}.parquet").exists()]
+ 
+     mpile_container: list[pl.LazyFrame] = []
+     read_loc_pfs: list[pl.LazyFrame] = []
+     for pf, read_loc_pf in pfs:
+         mpile_container.append(pl.scan_parquet(pf))
+         read_loc_pfs.append(pl.scan_parquet(read_loc_pf))
+ 
+     mpileup_df = pl.concat(mpile_container)
+     mpileup_df.sink_parquet(output_dir/f"{bam_file.stem}_profile.parquet", compression='zstd', engine='streaming')
+     read_loc_df = pl.concat(read_loc_pfs).rename(
+         {
+             "chrom": "scaffold",
+             "pos": "loc",
+         }
+     )
+ 
+     utils.get_genome_stats(
+         profile=mpileup_df,
+         read_loc_table=read_loc_df,
+         stb=stb,
+         bed=bed_lf.rename({"column_1": "scaffold", "column_2": "start", "column_3": "end"}),
+         ber=ber,
+         fug=fug,
+         min_cov_use_fug=min_cov_use_fug,
+     ).sink_parquet(output_dir/f"{bam_file.stem}_genome_stats.parquet", compression='zstd', engine='streaming')
+ 
      os.system(f"rm -r {output_dir}/tmp")

  def profile_bam(
      bed_file:str,
      bam_file:str,
      gene_range_table:str,
+     stb:pl.LazyFrame,
      output_dir:str,
-     num_workers:int=4
+     num_workers:int=4,
+     ber:float=0.5,
+     fug:float=2.0,
+     min_cov_use_fug:float=0.1
  )->None:
      """
      Profile a BAM file in chunks using provided BED files.
@@ -208,6 +251,7 @@ def profile_bam(
          bed_file (list[pathlib.Path]): A bed file describing all regions to be profiled.
          bam_file (pathlib.Path): Path to the BAM file.
          gene_range_table (pathlib.Path): Path to the gene range table.
+         stb (pl.LazyFrame): Scaffold-to-bin mapping table.
          output_dir (pathlib.Path): Directory to save output files.
          num_workers (int): Number of concurrent workers to use.
      """
@@ -215,7 +259,11 @@ def profile_bam(
          bed_file=bed_file,
          bam_file=bam_file,
          gene_range_table=gene_range_table,
+         stb=stb,
          output_dir=output_dir,
-         num_workers=num_workers
+         num_workers=num_workers,
+         ber=ber,
+         fug=fug,
+         min_cov_use_fug=min_cov_use_fug
      ))

@@ -127,6 +127,9 @@ class Status(StrEnum):
      SUCCESS = "success"
      PENDING = "pending"

+ class Messages(StrEnum):
+     """Enumeration of common messages used in task and batch management."""
+     CANCELLED_BY_USER = "Task was cancelled by a signal from the user."

  class Input(ABC):
      """Abstract base class for task inputs. DO NOT INSTANTIATE DIRECTLY.
@@ -275,7 +278,6 @@ class IntOutput(Output):
              raise ValueError(f"Output value for task {self.task.id} is not an integer.")
          else:
              return False
-         return False


  class Engine(ABC):
@@ -391,6 +393,7 @@ class Task(ABC):
          """Asynchronously reads the task status from the .status file in the task directory."""
          status_path = self.task_dir / ".status"
          # read the status file if it exists
+ 
          if status_path.exists():
              raw = await read_file(status_path, self.file_semaphore)
              self._status = raw.strip()
@@ -406,12 +409,11 @@ class Task(ABC):
          except Exception:
              all_ready = False

-         if all_ready:
+         if all_ready or self._batch_obj._cleaned_up:
              self._status = Status.SUCCESS.value
-             await write_file(status_path, Status.SUCCESS.value, self.file_semaphore)
+ 
          else:
              self._status = Status.FAILED.value
-             await write_file(status_path, Status.FAILED.value, self.file_semaphore)
              raise ValueError(f"Task {self.id} reported done but outputs are not ready or invalid. {self.expected_outputs['output-file'].expected_file.absolute()}")

          return self._status
@@ -521,13 +523,13 @@ class ProfileTaskGenerator(TaskGenerator):
              "bed-file": FileInput(self.profile_bed_file),
              "gene-range-table": FileInput(self.gene_range_file),
              "genome-length-file": FileInput(self.genome_length_file),
-             "num-threads": IntInput(self.num_procs),
+             "num-workers": IntInput(self.num_procs),
              "breadth-min-cov": IntInput(self.breadth_min_cov),
          }
          expected_outputs = {
              "profile": FileOutput(row["sample_name"]+".parquet"),
-             "breadth": FileOutput(row["sample_name"]+"_breadth.parquet"),
              "scaffold": FileOutput(row["sample_name"]+".parquet.scaffolds"),
+             "genome-stats": FileOutput(row["sample_name"]+"_genome_stats.parquet"),
          }
          task = ProfileBamTask(id=row["sample_name"], inputs=inputs, expected_outputs=expected_outputs, engine=self.engine)
          tasks.append(task)
@@ -637,9 +639,8 @@ class Batch(ABC):
              task.map_io()

          self._runner_obj:Runner = None
- 
+         self._cleaned_up = False

- 
      def _get_initial_status(self) -> str:
          """Returns the initial status of the batch based on the presence of the batch directory."""
          if not self.batch_dir.exists():
@@ -663,6 +664,7 @@ class Batch(ABC):

      def cleanup(self) -> None:
          """The base class defines if any cleanup is needed after batch success. By default, it does nothing."""
+         self._cleaned_up = True
          return None

      @abstractmethod
@@ -718,33 +720,30 @@ class LocalBatch(Batch):

      def __init__(self, tasks, id, run_dir, expected_outputs) -> None:
          super().__init__(tasks, id, run_dir, expected_outputs)
-         self._script = self.TEMPLATE_CMD + "\nset -o pipefail\n"
+         self._script = self.TEMPLATE_CMD + "\nset -euo pipefail\n"
          self._proc: asyncio.subprocess.Process | None = None


      async def run(self) -> None:
          """This method runs all tasks in the batch locally by creating a shell script and executing it."""
-         if self.status != Status.SUCCESS and self.status != Status.FAILED.value:
+         if self.status != Status.SUCCESS:
              self.batch_dir.mkdir(parents=True, exist_ok=True)
+ 
              self._status = Status.RUNNING.value
- 
- 
              await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
- 
+ 
+             script_path = self.batch_dir / f"{self.id}.sh"  # Path to the shell script for the batch
+             script = self._script  # Initialize the script content
+ 
              for task in self.tasks:
-                 if task.status == Status.NOT_STARTED.value:
+                 if task.status != Status.SUCCESS.value:
                      task.task_dir.mkdir(parents=True, exist_ok=True)  # Create task directory
                      await write_file(task.task_dir / ".status", Status.NOT_STARTED.value, self.file_semaphore)
- 
-             script_path = self.batch_dir / f"{self.id}.sh"  # Path to the shell script for the batch
- 
-             script = self._script
-             for task in self.tasks:
-                 if task.status == Status.NOT_STARTED.value or task.status == Status.FAILED.value:
                      script += f"\n{task.pre_run}\n{task.command}\n{task.post_run}\n"

+ 
+ 
              await write_file(script_path, script, self.file_semaphore)
- 
              await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)

              self._proc = await asyncio.create_subprocess_exec(
@@ -753,28 +752,37 @@ class LocalBatch(Batch):
                  stderr=asyncio.subprocess.PIPE,
                  cwd=self.batch_dir,
              )
+ 
              try:
                  out_bytes, err_bytes = await self._proc.communicate()
+ 
              except asyncio.CancelledError:
                  if self._proc and self._proc.returncode is None:
                      self._proc.terminate()
-                 raise
+                 await write_file(self.batch_dir / f"{self.id}.err", Messages.CANCELLED_BY_USER.value, self.file_semaphore)
+ 
+                 raise Exception(Messages.CANCELLED_BY_USER.value)
+ 

              await write_file(self.batch_dir / f"{self.id}.out", out_bytes.decode(), self.file_semaphore)
              await write_file(self.batch_dir / f"{self.id}.err", err_bytes.decode(), self.file_semaphore)

+             if self._proc.returncode != 0:
+                 error = err_bytes.decode()
+                 raise RuntimeError(f"Batch script failed with error:\n{error}")
+ 
              if self._proc.returncode == 0 and self.outputs_ready():
                  self.cleanup()
                  self._status = Status.SUCCESS.value
                  await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
+ 
              else:
                  self._status = Status.FAILED.value
                  await write_file(self.batch_dir / ".status", self._status, self.file_semaphore)
- 
-         elif self.status == Status.SUCCESS.value and self.outputs_ready():
-             self._status = Status.SUCCESS.value
+ 
          else:
-             self._status = Status.FAILED.value
+             self._status = Status.SUCCESS.value
+ 

      def _parse_job_id(self, sbatch_output):
          return super()._parse_job_id(sbatch_output)
@@ -810,7 +818,7 @@ class SlurmBatch(Batch):
          super().__init__(tasks, id, run_dir, expected_outputs)
          self._check_slurm_works()
          self.slurm_config = slurm_config
-         self._script = self.TEMPLATE_CMD + self.slurm_config.to_slurm_args() + "\nset -o pipefail\n"
+         self._script = self.TEMPLATE_CMD + self.slurm_config.to_slurm_args() + "\nset -euo pipefail\n"
          self._job_id = None

      def _check_slurm_works(self) -> None:
@@ -1450,7 +1458,7 @@ class ProfileBamTask(Task):

      - gene-range-table: A BED file specifying the gene ranges for the sample.

-     - num-threads: The number of threads to use for processing.
+     - num-workers: The number of concurrent workers to use for processing.

      - genome-length-file: A file containing the lengths of the genomes in the reference fasta.

@@ -1470,15 +1478,12 @@ class ProfileBamTask(Task):
      zipstrain profile profile-single --bam-file input.bam \
          --bed-file bed_file.bed \
          --gene-range-table gene-range-table.bed \
+         --stb-file <stb-file> \
          --num-workers <num-workers> \
          --output-dir .
-     mv input.bam.parquet <sample-name>.parquet
+     mv input_profile.parquet <sample-name>.parquet
+     mv input_genome_stats.parquet <sample-name>_genome_stats.parquet
      samtools idxstats <bam-file> | awk '$3 > 0 {print $1}' > <sample-name>.parquet.scaffolds
-     zipstrain utilities genome_breadth_matrix --profile <sample-name>.parquet \
-         --genome-length <genome-length-file> \
-         --stb <stb-file> \
-         --min-cov <breadth-min-cov> \
-         --output-file <sample-name>_breadth.parquet
      """

  class FastCompareTask(Task):
@@ -1552,6 +1557,7 @@ class FastCompareLocalBatch(LocalBatch):
          for task in tasks_to_remove:
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+         self._cleaned_up = True

  class FastCompareSlurmBatch(SlurmBatch):
      """A SlurmBatch that runs FastCompareTask tasks on a Slurm cluster. Maybe removed in future"""
@@ -1560,6 +1566,8 @@ class FastCompareSlurmBatch(SlurmBatch):
          for task in tasks_to_remove:
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+ 
+         self._cleaned_up = True

  class PrepareCompareGenomeRunOutputsLocalBatch(LocalBatch):
      pass
@@ -1925,8 +1933,11 @@ class FastGeneCompareLocalBatch(LocalBatch):
      def cleanup(self) -> None:
          tasks_to_remove = [task for task in self.tasks if isinstance(task, FastGeneCompareTask)]
          for task in tasks_to_remove:
+             task._status = Status.SUCCESS
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+         self._cleaned_up = True
+ 

  class FastGeneCompareSlurmBatch(SlurmBatch):
      """A SlurmBatch that runs FastGeneCompareTask tasks on a Slurm cluster."""
@@ -1935,6 +1946,7 @@ class FastGeneCompareSlurmBatch(SlurmBatch):
          for task in tasks_to_remove:
              self.tasks.remove(task)
              shutil.rmtree(task.task_dir)
+         self._cleaned_up = True

  class PrepareGeneCompareRunOutputsLocalBatch(LocalBatch):
      pass
@@ -140,7 +140,7 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output

          if writer is None:
              # Open writer for the first time
-             writer = pq.ParquetWriter(output_file, schema, compression='snappy')
+             writer = pq.ParquetWriter(output_file, schema, compression='zstd')
          writer.write_table(pa.Table.from_batches([batch]))

          # Clear buffers
@@ -180,6 +180,51 @@ def process_mpileup_function(gene_range_table_loc, batch_bed, batch_size, output
      if writer:
          writer.close()

+ def process_read_location(output_file: str, batch_size: int = 10000) -> None:
+     """
+     Takes the output of `samtools view -F 132` on stdin and writes the read locations to a Parquet file.
+     """
+     schema = pa.schema([
+         ('chrom', pa.string()),
+         ('pos', pa.int32()),
+     ])
+     writer = None
+     chroms = []
+     positions = []
+     def flush_batch():
+         nonlocal writer
+         if not chroms:
+             return
+         batch = pa.RecordBatch.from_arrays([
+             pa.array(chroms, type=pa.string()),
+             pa.array(positions, type=pa.int32()),
+         ], schema=schema)
+ 
+         if writer is None:
+             # Open writer for the first time
+             writer = pq.ParquetWriter(output_file, schema, compression='zstd')
+         writer.write_table(pa.Table.from_batches([batch]))
+ 
+         # Clear buffers
+         chroms.clear()
+         positions.clear()
+     for line in sys.stdin:
+         if not line.strip():
+             continue
+         fields = line.strip().split('\t')
+         if len(fields) < 4:
+             continue
+         chrom, pos = fields[2], fields[3]
+         chroms.append(chrom)
+         positions.append(int(pos))
+         if len(chroms) >= batch_size:
+             flush_batch()
+     # Flush remaining data
+     flush_batch()
+     if writer:
+         writer.close()
+ 
+ 
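Because the function only reads raw SAM lines on stdin and keeps fields 3 (RNAME) and 4 (POS), it can be exercised without samtools by substituting sys.stdin. A minimal sketch; the zipstrain.utils import path is an assumption based on the ut alias used in the CLI:

    import io
    import sys
    import polars as pl
    from zipstrain import utils as ut  # assumed module path

    # Two fake SAM records: QNAME, FLAG, RNAME, POS, ... (only fields 3 and 4 are used).
    fake_sam = (
        "read1\t0\tcontig_1\t101\t60\t50M\t*\t0\t0\tACGT\tIIII\n"
        "read2\t0\tcontig_1\t180\t60\t50M\t*\t0\t0\tACGT\tIIII\n"
    )
    old_stdin = sys.stdin
    sys.stdin = io.StringIO(fake_sam)
    try:
        ut.process_read_location(output_file="read_locs.parquet")
    finally:
        sys.stdin = old_stdin

    # The resulting table has columns 'chrom' and 'pos'.
    print(pl.read_parquet("read_locs.parquet"))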
  def extract_genome_length(stb: pl.LazyFrame, bed_table: pl.LazyFrame) -> pl.LazyFrame:
      """
      Extract the genome length information from the scaffold-to-genome mapping table.
@@ -350,102 +395,106 @@ def split_lf_to_chunks(lf:pl.LazyFrame,num_chunks:int)->list[pl.LazyFrame]:
      return chunks


- def estimate_genome_presence(
+ def get_genome_gaps(
+     read_loc_table: pl.LazyFrame,
+     stb: pl.LazyFrame,
+     genome_length: pl.LazyFrame,
+ ) -> pl.LazyFrame:
+     """
+     Estimate, per genome, the FUG statistic from the gaps between consecutive
+     read start positions.
+     """
+     read_loc_table = read_loc_table.sort(["scaffold", "loc"])
+     read_loc_table = read_loc_table.with_columns(
+         (pl.col("loc") - pl.col("loc").shift(1).over("scaffold")).alias("gap_length")
+     ).join(
+         stb,
+         on="scaffold",
+         how="left"
+     )
+     delta = read_loc_table.group_by("genome").agg(
+         rn=pl.len()
+     ).join(
+         genome_length,
+         on="genome",
+         how="left"
+     ).with_columns(
+         delta=(pl.col("genome_length")/pl.col("rn")).round()
+     ).select(
+         pl.col("genome"),
+         pl.col("delta"),
+         pl.col("rn")
+     )
+     read_loc_table = read_loc_table.join(
+         delta,
+         on="genome",
+         how="left"
+     )
+     read_loc_table = read_loc_table.filter(
+         pl.col("gap_length") > pl.col("delta")
+     ).group_by(["genome", "gap_length"]).agg(
+         pd=(pl.len()/(pl.col("rn").first()-1)),
+         delta=pl.col("delta").first()
+     ).with_columns(
+         pd=pl.col("pd") * (pl.col("gap_length")-pl.col("delta"))
+     ).group_by("genome").agg(
+         fug=(pl.col("delta").first()-pl.col("pd").sum())/pl.col("delta").first()
+     )
+     return read_loc_table.select(
+         pl.col("genome"),
+         pl.col("fug")
+     )
+ 
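To make the gap arithmetic concrete: with rn read starts on a genome of length L, the expected gap is delta = round(L / rn); each distinct observed gap longer than delta contributes (count / (rn - 1)) * (gap_length - delta), and fug is the fraction of delta left after subtracting that excess. A plain-Python rendering of the same computation with toy numbers (not from any real sample):

    from collections import Counter

    # Toy example: one genome, hypothetical read start positions on one scaffold.
    genome_length = 1000
    read_starts = sorted([10, 120, 230, 340, 460, 580, 700, 820, 940, 995])

    rn = len(read_starts)
    delta = round(genome_length / rn)  # expected gap under uniform read placement

    # Gaps between consecutive read starts (shift(1).over("scaffold") in the Polars version).
    gaps = [b - a for a, b in zip(read_starts, read_starts[1:])]

    # Only gaps longer than delta contribute excess gap mass.
    pd_sum = sum(
        (count / (rn - 1)) * (gap - delta)
        for gap, count in Counter(g for g in gaps if g > delta).items()
    )
    fug = (delta - pd_sum) / delta
    print(delta, fug)  # delta = 100; most gaps are ~110-120 here, so fug stays close to 1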
+ def get_genome_stats(
      profile:pl.LazyFrame,
      bed: pl.LazyFrame,
      stb: pl.LazyFrame,
+     read_loc_table: pl.LazyFrame,
      ber:float=0.5,
-     cv_threshold:float=2.5,
-     min_cov_constant_poisson: int = 0.5,
+     fug:float=2.0,
+     min_cov_use_fug:float=0.1
  )->pl.LazyFrame:
-     """
-     This function estimates the presence of genomes in a sample based on coverage information.
-     as long as the coverage is above a certain threshold. BER is used to decide the threshold.
-     However, if the coverage is below the threshold, the coefficient of variation (CV) is used instead as
-     a more reliable metric for low-coverage scenarios.
- 
-     Args:
-         profile (pl.LazyFrame): The profile LazyFrame containing coverage information.
-         bed (pl.LazyFrame): The BED table containing genomic regions.
-         stb (pl.LazyFrame): The scaffold-to-bin mapping LazyFrame.
-         ber (float): Breadth over expected breadth ratio threshold for genome presence.
-         cv_threshold (float): Coefficient of variation threshold for genome presence.
-         min_cov_constant_poisson (int): Minimum coverage threshold to use BER for presence estimation.
- 
-     Returns:
-         pl.LazyFrame: A LazyFrame containing genome presence information.
-     """
+     """
+     Compute per-genome coverage, breadth, BER, and FUG, and call presence.
+ 
+     A genome is called present when BER exceeds `ber`; at coverage at or below
+     `min_cov_use_fug`, the normalized FUG statistic must also fall below `fug`.
+     """
-     profile=profile.with_columns(
-         (pl.col("A")+pl.col("T")+pl.col("C")+pl.col("G")).alias("coverage")
-     )
-     starts_df=bed.select(
-         pl.col("scaffold").cast(profile.collect_schema()["chrom"]).alias("chrom"),
-         pl.col("start").cast(profile.collect_schema()["pos"]).alias("pos"),
-         pl.lit("NA").cast(profile.collect_schema()["gene"]).alias("gene"),
-         pl.lit(0).cast(profile.collect_schema()["A"]).alias("A"),
-         pl.lit(0).cast(profile.collect_schema()["T"]).alias("T"),
-         pl.lit(0).cast(profile.collect_schema()["C"]).alias("C"),
-         pl.lit(0).cast(profile.collect_schema()["G"]).alias("G"),
-         pl.lit(0).cast(profile.collect_schema()["coverage"]).alias("coverage")
-     )
-     ends_df=bed.select(
-         pl.col("scaffold").cast(profile.collect_schema()["chrom"]).alias("chrom"),
-         (pl.col("end")-1).cast(profile.collect_schema()["pos"]).alias("pos"),
-         pl.lit("NA").cast(profile.collect_schema()["gene"]).alias("gene"),
-         pl.lit(0).cast(profile.collect_schema()["A"]).alias("A"),
-         pl.lit(0).cast(profile.collect_schema()["T"]).alias("T"),
-         pl.lit(0).cast(profile.collect_schema()["C"]).alias("C"),
-         pl.lit(0).cast(profile.collect_schema()["G"]).alias("G"),
-         pl.lit(0).cast(profile.collect_schema()["coverage"]).alias("coverage")
-     )

-     profile=pl.concat([profile,starts_df,ends_df]).unique(subset=["chrom","pos"],keep="first").sort(["chrom","pos"])
-     genome_lengths=bed.join(
+     genome_lengths = extract_genome_length(stb, bed)
+     genome_gap_stats = get_genome_gaps(read_loc_table, stb, genome_lengths)
+     profile = profile.join(
          stb,
-         on="scaffold",
+         left_on="chrom",
+         right_on="scaffold",
          how="left"
-     ).group_by("genome").agg(
-         genome_length=(pl.col("end") - pl.col("start")).sum()
      ).select(
+         pl.col("chrom"),
          pl.col("genome"),
-         pl.col("genome_length")
+         (pl.col("A")+pl.col("C")+pl.col("G")+pl.col("T")).alias("coverage")
      )
-     profile=profile.with_columns(
-         pl.col("pos").shift(1).fill_null(0).over("chrom").alias("prev_pos"),
-     ).with_columns(
-         (pl.col("pos") - pl.col("prev_pos")).clip(lower_bound=1).alias("gap_size")
+     profile = profile.group_by("genome").agg(
+         total_covered_sites=pl.len(),
+         coverage=pl.col("coverage").sum()
      ).join(
-         stb,
-         left_on="chrom",
-         right_on="scaffold",
+         genome_lengths,
+         on="genome",
          how="left"
-     ).group_by("genome").agg(
-         cv=pl.col("gap_size").filter(pl.col("gap_size") > 1).std()/pl.col("gap_size").filter(pl.col("gap_size") > 1).mean(),
-         total_coverage=pl.col("coverage").sum(),
-         covered_positions=(pl.col("coverage")>0).sum()
      ).join(
-         genome_lengths,
+         genome_gap_stats,
          on="genome",
          how="left"
      ).with_columns(
-         (pl.col("covered_positions")/pl.col("genome_length")).alias("breadth"),
-         (pl.col("total_coverage")/pl.col("genome_length")).alias("coverage"),
-     ).select(
-         pl.col("genome"),
-         pl.col("cv"),
-         pl.col("breadth"),
-         pl.col("coverage"),
+         coverage=(pl.col("coverage")/pl.col("genome_length")),
+         breadth=(pl.col("total_covered_sites")/pl.col("genome_length")),
      ).with_columns(
-         (pl.col("breadth")/(1-(-0.883*pl.col("coverage")).exp())).alias("ber"),
+         ber=pl.col("breadth")/(1-(-0.883*pl.col("coverage")).exp()),
+         fug=pl.col("fug")
      ).with_columns(
-         pl.when(
-             pl.col("coverage") >= min_cov_constant_poisson
-         ).then(
-             pl.col("ber") >= ber
+         pl.when(pl.col("coverage") > min_cov_use_fug)
+         .then(
+             pl.col("ber") > ber
          ).otherwise(
-             (pl.col("cv") <= cv_threshold) & (~pl.col("ber").is_nan())
-         ).alias("is_present")
+             (pl.col("fug")/0.632 < fug) &
+             (pl.col("ber") > ber)
+         ).fill_null(False).alias("is_present")
+     )
+ 
+     return profile.select(
+         pl.col("genome"),
+         pl.col("coverage"),
+         pl.col("breadth"),
+         pl.col("ber"),
+         pl.col("fug"),
+         pl.col("is_present")
      )

-     return profile
- 
+ 
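The presence call combines two signals: expected breadth follows the exponential model 1 - exp(-0.883 * coverage), BER is observed breadth over that expectation, and at low coverage the FUG statistic is additionally required to stay below the threshold after dividing by 0.632 (which matches 1 - 1/e, presumably its value under uniform read placement). A small eager sketch of the rule with made-up numbers:

    import math

    def is_present(coverage: float, breadth: float, fug_value: float,
                   ber_cutoff: float = 0.5, fug_cutoff: float = 2.0,
                   min_cov_use_fug: float = 0.1) -> bool:
        # Expected breadth under the exponential coverage model used in get_genome_stats.
        expected_breadth = 1 - math.exp(-0.883 * coverage)
        ber = breadth / expected_breadth
        if coverage > min_cov_use_fug:
            return ber > ber_cutoff
        # Low coverage: also require the normalized FUG statistic to stay small.
        return (fug_value / 0.632 < fug_cutoff) and (ber > ber_cutoff)

    # Made-up numbers: 2x coverage with 80% breadth passes on BER alone...
    print(is_present(coverage=2.0, breadth=0.80, fug_value=0.9))   # True
    # ...while at 0.05x coverage both the FUG and BER checks must agree.
    print(is_present(coverage=0.05, breadth=0.03, fug_value=0.2))  # True: both checks pass here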
File without changes