PyPI - panxpress - Versions diffs - 0.2__tar.gz - Mend

panxpress 0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of panxpress might be problematic. Click here for more details.

Files changed (53) hide show

panxpress-0.2/LICENSE +21 -0
panxpress-0.2/MANIFEST.in +5 -0
panxpress-0.2/PKG-INFO +374 -0
panxpress-0.2/README.md +327 -0
panxpress-0.2/panxpress/cuckoo_filter_utils.py +207 -0
panxpress-0.2/panxpress/dnaencode.py +277 -0
panxpress-0.2/panxpress/dnaencode_fast.py +156 -0
panxpress-0.2/panxpress/fastcash_info.py +156 -0
panxpress-0.2/panxpress/fastcash_main.py +271 -0
panxpress-0.2/panxpress/fastcash_weak_ptr.py +489 -0
panxpress-0.2/panxpress/hash_new.py +742 -0
panxpress-0.2/panxpress/hashfunctions.py +221 -0
panxpress-0.2/panxpress/io/binaryio.py +62 -0
panxpress-0.2/panxpress/io/fastaio.py +461 -0
panxpress-0.2/panxpress/io/fastqio.py +596 -0
panxpress-0.2/panxpress/io/filterio.py +112 -0
panxpress-0.2/panxpress/io/generaldsio.py +46 -0
panxpress-0.2/panxpress/io/generalio.py +252 -0
panxpress-0.2/panxpress/io/hashio.py +514 -0
panxpress-0.2/panxpress/io/seqio.py +200 -0
panxpress-0.2/panxpress/io/textio.py +21 -0
panxpress-0.2/panxpress/io/xorio.py +94 -0
panxpress-0.2/panxpress/kmers.py +474 -0
panxpress-0.2/panxpress/lowlevel/aligned_arrays.py +42 -0
panxpress-0.2/panxpress/lowlevel/bitarray.py +228 -0
panxpress-0.2/panxpress/lowlevel/conpro.py +504 -0
panxpress-0.2/panxpress/lowlevel/debug.py +97 -0
panxpress-0.2/panxpress/lowlevel/intbitarray.py +252 -0
panxpress-0.2/panxpress/lowlevel/libc.py +174 -0
panxpress-0.2/panxpress/lowlevel/llvm.py +638 -0
panxpress-0.2/panxpress/lowlevel/lowlevelfunctions.txt +1 -0
panxpress-0.2/panxpress/lowlevel/numbautils.py +25 -0
panxpress-0.2/panxpress/lowlevel/packedarray.py +186 -0
panxpress-0.2/panxpress/mask.py +69 -0
panxpress-0.2/panxpress/mathutils.py +296 -0
panxpress-0.2/panxpress/panxpress/config/index.yaml +7 -0
panxpress-0.2/panxpress/panxpress/panxpress_build_reference.py +475 -0
panxpress-0.2/panxpress/panxpress/panxpress_correct_gff.py +1342 -0
panxpress-0.2/panxpress/panxpress/panxpress_index.py +286 -0
panxpress-0.2/panxpress/panxpress/panxpress_main.py +308 -0
panxpress-0.2/panxpress/panxpress/panxpress_map_parallel.py +480 -0
panxpress-0.2/panxpress/parameters.py +63 -0
panxpress-0.2/panxpress/srhash.py +594 -0
panxpress-0.2/panxpress/subtable_hashfunctions.py +395 -0
panxpress-0.2/panxpress/values/panxpress.py +87 -0
panxpress-0.2/panxpress.egg-info/PKG-INFO +374 -0
panxpress-0.2/panxpress.egg-info/SOURCES.txt +51 -0
panxpress-0.2/panxpress.egg-info/dependency_links.txt +1 -0
panxpress-0.2/panxpress.egg-info/entry_points.txt +3 -0
panxpress-0.2/panxpress.egg-info/requires.txt +5 -0
panxpress-0.2/panxpress.egg-info/top_level.txt +1 -0
panxpress-0.2/pyproject.toml +55 -0
panxpress-0.2/setup.cfg +4 -0

panxpress-0.2/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026- Inês Alves Ferreira, Jens Zentgraf, Johanna Elena Schmitz & Sven Rahmann
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

panxpress-0.2/MANIFEST.in ADDED Viewed

@@ -0,0 +1,5 @@
+include panxpress/panxpress/*
+include panxpress/io/*
+include panxpress/lowlevel/*
+include panxpress/values/*
+include panxpress/panxpress/config/*

panxpress-0.2/PKG-INFO ADDED Viewed

@@ -0,0 +1,374 @@
+Metadata-Version: 2.4
+Name: panxpress
+Version: 0.2
+Summary: Gene expression quantification with a pan-transcriptomic gapped k-mer index
+Author-email: Inês Alves Ferreira <zentgraf@cs.uni-saarland.de>, Jens Zentgraf <zentgraf@cs.uni-saarland.de>, Johanna Elena Schmitz <jschmitz@cs.uni-saarland.de>, Sven Rahmann <rahmann@cs.uni-saarland.de>
+License: MIT License
+        Copyright (c) 2026- Inês Alves Ferreira, Jens Zentgraf, Johanna Elena Schmitz & Sven Rahmann
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://gitlab.com/rahmannlab/panxpress
+Project-URL: Bug Tracker, https://gitlab.com/rahmannlab/panxpress/issues
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: Natural Language :: English
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Requires-Dist: numba
+Requires-Dist: pytest
+Requires-Dist: jsonargparse>=4.29.0
+Requires-Dist: pip
+Dynamic: license-file
+![](logo.png)
+# PanXpress: Gene Expression Quantification with a Pan-Transcriptomic Gapped K-mer Index
+PanXpress is a unified framework for bacterial pan-transcriptomics that:
+- Corrects ambiguous annotations in GFF files using a two-step algorithm
+- Constructs a pan-transcriptomic reference FASTA file from genomic FASTA and corrected GFF annotation files
+- Builds a gapped k-mer index over the pan-transcriptomic reference
+- Supports alignment-free mapping of reads to genes from FASTQ samples
+- Quantifies gene expression across strains
+In case of problems, please file an issue in the issue tracker.
+See `CHANGELOG.md` for recent changes.
+---
+# Usage Guide
+To use PanXpress you will need the following data and files:
+- **GFF annotation files** and **genome FASTA files** for the bacterial strains to be included in the reference index. For a given strain, the annotation and genome files must share the same filename (differing only in their extension).
+- **FASTQ samples** for mapping.
+Additionally, for the correction of annotation files where similar proteins are grouped into gene groups, protein FASTA files can be provided by the user. If these are not available, they can be generated by PanXpress using the `agat` tool.
+The typical workflow proceeds in five steps: (1) correct the GFF annotation files, (2) build the pan-transcriptomic reference, (3) construct the index, (4) map reads, and (5) convert the raw read counts to transcripts per million.
+---
+## Installation Guide
+To run the software, a conda environment with the required libraries needs to be created.
+A list of needed libraries is provided in the environment.yml file in the repository. You can create the environment with the following command:
+```bash
+conda env create
+```
+An environment with the name `panxpress` will be created. To activate the environment and install the package from the repository, run:
+```bash
+conda activate panxpress
+pip install -e .
+```
+## Examples
+To better understand how to run PanXpress, we included in this repository a folder `reads` with a few simulated reads from a mixture of 3 strains of Pseudomonas aeruginosa (both single and paired end) and a folder `ref` that contains `pa_2_strains`, with the **GFF** annotation files and the genome and protein **FASTA** files for 2 strains of Pseudomonas aeruginosa. We will use these files to exemplify how to use PanXpress.
+---
+## Step 1 — Correcting the Annotation Files
+We recommend applying a correction algorithm to the annotation GFF files using the `panxpress correct_gff` command.
+Proteins are grouped into gene groups via a two-step process:
+1. **Jaccard filtering:** The Jaccard similarity is computed for all pairs of proteins across bacterial strains. Pairs with a similarity score above a threshold `t1` proceed to the next step.
+2. **Alignment filtering:** Surviving pairs are aligned. Pairs with a normalized alignment score above a threshold `t2` are grouped into the same gene group and assigned a shared gene group name. This information is then used to rewrite the annotation files.
+### Arguments
+- `--input_gff_folder` | Path to the folder containing the input annotation `.gff` files.
+- `--input_normalized_gff_folder` | Path to a folder where normalized GFF files will be written. Two corrections are applied: (1) each protein ID is given a single consistent name across strains; (2) genes from plasmids are appended with the suffix `"Plasmid"` to distinguish them from chromosomal genes in expression results.
+- `--output_gff_folder` | Path to the folder where corrected annotation `.gff` files will be stored. Hypothetical proteins are assigned the gene name `"unnamed"`.
+- `--t1` | Similarity threshold for step 1 (Jaccard filtering).
+- `--t2` | Score threshold for step 2 (alignment filtering).
+- `--threads` | Number of threads for parallelized Jaccard similarity calculation and protein pair alignments.
+- `--output_plot` | Path prefix for plots of the alignment score distribution.
+- `--output_folder_data` | Path prefix for output data files (Jaccard similarity values, alignment scores, group names, etc.).
+- `--input_protein_folder` | Path to a folder containing protein `.faa` files. If not available, provide the two arguments below instead.
+- `--input_genome_folder` | *(Alternative to `--input_protein_folder`)* Path to a folder containing genome `.fna` files, used to generate protein files.
+- `--output_protein_folder` | *(Alternative to `--input_protein_folder`)* Path to the folder where generated protein `.faa` files will be stored.
+### How to run (general input)
+**With protein FASTA files provided:**
+```bash
+panxpress correct_gff \
+  --input_protein_folder <folder_with_proteins> \
+  --input_gff_folder <folder_with_annotations> \
+  --input_normalized_gff_folder <folder_for_normalized_annotations> \
+  --output_gff_folder <folder_for_corrected_annotations> \
+  --k 7 \
+  --t1 0.02 \
+  --t2 0.75 \
+  --threads 8 \
+  --output_folder_data <filename_prefix_for_data>
+```
+**Without protein FASTA files (generate them from genomes):**
+```bash
+panxpress correct_gff \
+  --input_genome_folder <folder_with_genomes> \
+  --output_protein_folder <folder_with_proteins> \
+  --input_gff_folder <folder_with_annotations> \
+  --input_normalized_gff_folder <folder_for_normalized_annotations> \
+  --output_gff_folder <folder_for_corrected_annotations> \
+  --k 7 \
+  --t1 0.02 \
+  --t2 0.75 \
+  --threads 8 \
+  --output_folder_data <filename_prefix_for_data>
+```
+### How to run (provided files)
+**With protein FASTA files provided:**
+```bash
+panxpress correct_gff \
+  --input_protein_folder ref/pa_2_strains/protein_ncbi \
+  --input_gff_folder ref/pa_2_strains/gff3 \
+  --input_normalized_gff_folder ref/pa_2_strains/gff3_normalized \
+  --output_gff_folder ref/pa_2_strains/gff3_corrected \
+  --k 7 \
+  --t1 0.02 \
+  --t2 0.75 \
+  --threads 8 \
+  --output_folder_data ref/pa_2_strains
+```
+**Without protein FASTA files (generate them from genomes):**
+```bash
+panxpress correct_gff \
+  --input_genome_folder ref/pa_2_strains/genomes \
+  --output_protein_folder ref/pa_2_strains/protein_agat \
+  --input_gff_folder ref/pa_2_strains/gff3 \
+  --input_normalized_gff_folder ref/pa_2_strains/gff3_normalized \
+  --output_gff_folder ref/pa_2_strains/gff3_corrected \
+  --k 7 \
+  --t1 0.02 \
+  --t2 0.75 \
+  --threads 8 \
+  --output_folder_data ref/pa_2_strains
+```
+---
+## Step 2 — Building the Reference
+The pan-transcriptomic reference is built using the `panxpress build_reference` command.
+The reference is a FASTA file where each header is a unique gene identifier and each entry corresponds to one occurrence of a given gene name in a given strain. Because many gene names are shared across strains, multiple entries may share the same header. Since some tools (e.g., Bowtie2 and Salmon) do not support FASTA files with duplicate headers, PanXpress also generates a version of the reference with fully unique IDs.
+### Arguments
+- `--annotation_dir` | Path to the folder containing the corrected annotation `.gff` files.
+- `--genomes_dir` | Path to the folder containing the genome `.fna` files.
+- `--output_reference_file` | Output FASTA file.
+- `--output_reference_file_unique` | Output FASTA file — unique headers variant.
+- `--output_reference_file_genes` | Output FASTA file containing only named genes (hypothetical proteins excluded).
+- `--output_reference_file_unique_genes` | Output FASTA file containing only named genes — unique headers variant.
+- `--output_aux_files` | Path prefix for auxiliary data files (e.g., mapping of gene names to gene IDs used during the mapping step).
+### How to run (general input)
+```bash
+panxpress build_reference \
+  --annotation_dir <folder_with_annotations> \
+  --genomes_dir <folder_with_genomes> \
+  --output_reference_file <filename>.fna \
+  --output_reference_file_unique <filename>.fna \
+  --output_reference_file_genes <filename>.fna \
+  --output_reference_file_unique_genes <filename>.fna \
+  --output_aux_files <path_prefix_for_auxiliary_data>
+```
+### How to run (provided files)
+```bash
+panxpress build_reference \
+  --annotation_dir ref/pa_2_strains/gff3_corrected \
+  --genomes_dir ref/pa_2_strains/genomes \
+  --output_reference_file ref/spliced_genomes/pantranscriptome_2_strains.fna \
+  --output_reference_file_unique ref/spliced_genomes/pantranscriptome_unique_headers_2_strains.fna \
+  --output_reference_file_genes ref/spliced_genomes/pantranscriptome_genes_2_strains.fna \
+  --output_reference_file_unique_genes ref/spliced_genomes/pantranscriptome_genes_unique_headers_2_strains.fna \
+  --output_aux_files ref/pa_2_strains/pa_2_strains > ref/build_reference_2_strains.log
+```
+---
+## Step 3 — Index Construction
+The index is built using the `panxpress index` command.
+The index is backed by a cuckoo hash table that stores a mapping of gapped k-mers to the genes in which they appear. The number of genes tracked per k-mer is controlled by the `--colorset-size` parameter. If you want PanXpress to maximize the number of colors within memory constraints, provide the total number of genes in the pan-transcriptomic reference via `--ngenes` and the maximum color set size will be calculated automatically.
+### Arguments
+- `--genes` | FASTA file of the reference.
+- `--index` | Output path prefix for the resulting index.
+- `-n` | Estimated number of k-mers.
+- `--ngenes` | Number of unique gene IDs.
+- `--colorset-size` | Number of genes tracked per k-mer entry.
+- `--mask` | Mask pattern used to compute gapped k-mers (e.g., `"####_###_####_#__#__#_####_###_####"`).
+- `--k` | K-mer length.
+- `--fill` | Fill rate parameter for the cuckoo hash table.
+**Tip**: For Pseudomonas aeruginosa, 4 is a good number for the maximum number of colors since only a few k-mers occur in more than 4 different genes. To get a better idea of how to choose `n` and `ngenes`, you can check the output from the `build_reference` command. In that output, the maximum gene ID is printed (use this for `ngenes`) and the length of the pan-transcriptome is an upper bound for `n`.
+### How to run (general input)
+```bash
+panxpress index \
+  --genes <reference>.fna \
+  --index <path_prefix_for_index> \
+  -n 20000000 \
+  --ngenes 2600 \
+  --colorset-size 4 \
+  --mask "####_###_####_#__#__#_####_###_####" \
+  --fill 0.95
+```
+### How to run (provided files)
+```bash
+panxpress index \
+  --genes ref/spliced_genomes/pantranscriptome_2_strains.fna \
+  --index ref/pa_2_strains/spliced_index_2_strains \
+  -n 20000000 \
+  --ngenes 2600 \
+  --colorset-size 4 \
+  --mask "####_###_####_#__#__#_####_###_####" \
+  --fill 0.95
+```
+---
+## Step 4 — Mapping
+Read mapping is supported by the `panxpress pmap` command.
+### Arguments
+- `--index` | Path prefix of the index.
+- `--fastq` | Input FASTQ file.
+- `--mapping-file` | Pickle dictionary mapping gene IDs to gene names, generated during the reference building step.
+- `--output-file` | Output folder name.
+- `--threads-mapping` | Number of threads for mapping.
+- `--unnamed_gene_id` | Gene ID for hypothetical proteins. This can be obtained from the mapping file.
+**Tip**: To get the correct value for `unnamed_gene_id`, you can use the following bash command:
+```bash
+unnamed_gene_id=$(python3 -c "import pickle; f='<output_aux_files>_gene_name_to_gene_id'; d=pickle.load(open(f,'rb')); print(d['unnamed'])")
+```
+Note that `output_aux_files` should be the parameter you used in the `build_reference` command.
+### How to run (general input)
+```bash
+panxpress pmap \
+  --index <index_prefix> \
+  --fastq <reads_fastq> \
+  --output-file <output_folder> \
+  --threads-mapping 8 \
+  --mapping-file <gene_id_to_gene_name> \
+  --unnamed_gene_id 3
+```
+### How to run (provided files) - SINGLE END READS
+```bash
+panxpress map \
+  --index ref/pa_2_strains/spliced_index_2_strains \
+  --fastq reads/simulated_regulated_single_reads_pa_3_strains.fq \
+  --output-file results/single_end_reads_pa_3_strains \
+  --threads-mapping 8 \
+  --mapping-file ref/pa_2_strains/pa_2_strains_gene_id_to_gene_name \
+  --unnamed_gene_id $unnamed_gene_id
+```
+### How to run (provided files) - PAIRED END READS
+```bash
+panxpress map \
+  --index ref/pa_2_strains/spliced_index_2_strains \
+  --fastq reads/simulated_regulated_paired_end_reads_pa_3_strains_1.fq \
+  --paired-end reads/simulated_regulated_paired_end_reads_pa_3_strains_2.fq \
+  --output-file results/paired_end_reads_pa_3_strains \
+  --threads-mapping 8 \
+  --mapping-file ref/pa_2_strains/pa_2_strains_gene_id_to_gene_name \
+  --unnamed_gene_id $unnamed_gene_id
+```
+---
+## Step 5 — Gene expression values
+You can additionally convert the number of reads mapped to each gene into gene expression values in transcripts per million (TPM). This is supported by the `panxpress convert_TPM` command.
+### Arguments
+- `--raw_counts_file` | Output `.mat` file from the mapping step.
+- `--genes_info_file_prefix` | Path prefix for auxiliary data files (e.g., mapping of gene names to gene IDs used during the mapping step). Identical to `output_aux_files` in the build reference command.
+- `--output_file` | Output file name.
+### How to run (general input)
+```bash
+panxpress convert_TPM \
+--raw_counts_file <raw_counts_file> \
+--genes_info_file_prefix <genes_info_file_prefix> \
+--output_file <output_file>
+```
+### How to run (provided files)
+```bash
+panxpress convert_TPM \
+--raw_counts_file results/single_end_reads_pa_3_strains/count.mat \
+--genes_info_file_prefix ref/pa_2_strains/pa_2_strains \
+--output_file results/single_end_reads_pa_3_strains/counts_TPM.txt
+```
+---
+## Full Workflow Scripts
+For ready-to-run workflows on both simulated and real read data, please refer to the README in the `scripts/` folder.