PyPI - pycmplot - Versions diffs - 0.2.5__tar.gz → 0.2.6__tar.gz - Mend

pycmplot 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

pycmplot-0.2.6/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Kevin Esoh
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

{pycmplot-0.2.5 → pycmplot-0.2.6}/PKG-INFO RENAMED Viewed

@@ -1,15 +1,23 @@
 Metadata-Version: 2.4
 Name: pycmplot
-Version: 0.2.5
+Version: 0.2.6
 Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
 Author: Kevin Esoh
 Author-email: Kevin Esoh <kesohku1@jh.edu>
-License-Expression: CC-BY-NC-SA-4.0
+License-Expression: MIT
 Project-URL: Homepage, https://github.com/esohkevin/pycmplot
 Project-URL: Issues, https://github.com/esohkevin/pycmplot/issues
 Project-URL: Docs, https://pycmplot.readthedocs.io/en/latest/
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Science/Research
+Classifier: Natural Language :: English
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Visualization
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/__init__.py RENAMED Viewed

@@ -47,4 +47,4 @@ __all__ = [
     "ResourceConfig",
 ]
-__version__ = "0.2.5"
+__version__ = "0.2.6"

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/_core.py RENAMED Viewed

@@ -94,6 +94,7 @@ def main() -> None:
     from pycmplot.plotting.circular import plot_circular
     from pycmplot.plotting.qq import plot_qq_combined, plot_qq_separate, plot_qq_overlay
     from pycmplot.resources import ResourceConfig
+    from pycmplot.annotation import get_annotation_column
     # ------------------------------------------------------------------
     # Parse CLI
@@ -147,7 +148,9 @@ def main() -> None:
     track_heights    = args.track_heights
     linear_track_spacing    = args.linear_track_spacing
     no_track_labels  = args.no_track_labels
+    ylabel           = args.ylabel
     chr_spacing      = args.chr_spacing
+    figure_size      = args.figure_size
     # ------------------------------------------------------------------
@@ -226,23 +229,6 @@ def main() -> None:
     signif_lines = pycmplot_dict["lines"]
     pval_dict = pycmplot_dict["pvals"]
-    # ------------------------------------------------------------------
-    # ANNOTATE BY
-    # ------------------------------------------------------------------
-    label_col = 'SNP'
-    if annotate and not hits_table.empty:
-        if str(annotate).upper() == "GENE" and 'top_gene' in hits_table.columns:
-            label_col = 'top_gene'
-        elif annotate in hits_table.columns:
-            label_col = annotate
-        else:
-            logger.warning(
-                "Annotation column '%s' not found in hits table; "
-                "falling back to 'SNP'.", annotate,
-            )
-        logger.info("Annotate by: %s", label_col)
     # ------------------------------------------------------------------
     # CIRCULAR MANHATTAN
     # ------------------------------------------------------------------
@@ -264,7 +250,6 @@ def main() -> None:
             track_label_size = track_label_size,
             track_label_orientation = track_label_orientation,
             annotate = annotate,
-            label_col = label_col if annotate else None,
             annotation_size = annotation_size,
             hits_table = hits_table,
             sector_sizes = merged_assoc_sector_sizes,
@@ -284,6 +269,9 @@ def main() -> None:
     # ------------------------------------------------------------------
     else:
         logger.info("Generating LINEAR MANHATTAN Plot ...")
+        fsize = figure_size.strip(" ").split(",")
+        fsize = [int(v) for v in fsize]
+        logger.info(f"FIGURE SIZE: {fsize}")
         plot_linear(
             sumstats_loaded=sumstats_loaded,
             track_heights=t_heights,
@@ -295,19 +283,19 @@ def main() -> None:
             highlight_color=highlight_color,
             highlight_line=highlight_line,
             highlight_line_color=highlight_line_color,
-            annotate=annotate,
+            annotate=annotate,
             hits_table=hits_table if not hits_table.empty else None,
-            label_col=label_col if annotate else None,
             chr_spacing=chr_spacing,
             linear_track_spacing=linear_track_spacing,
             colors=colors,
             signif_lines=signif_lines,
             plot_title=plot_title,
             no_track_labels=no_track_labels,
+            ylabel=ylabel,
             dpi=dpi,
             output_format=output_format,
             output_dir=output_dir,
-            figsize=(15, 9)
+            figsize=fsize
         )
     # ------------------------------------------------------------------

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/annotation.py RENAMED Viewed

@@ -570,3 +570,36 @@ def get_hits_summary_table(
         logger.info("Locus summary written to: %s", outpath)
     return _clump_by_distance(locus_table, window_kb=window_kb)
def get_annotation_column(
    annotate: str = None,
    hits_table: pd.DataFrame = None,
    label_col: str = None,
) -> str:
    """Resolve which hits-table column supplies the annotation labels.

    Resolution order:

    1. ``label_col`` when it is given and is a column of *hits_table*.
    2. ``annotate`` when it is itself a column of *hits_table*.
    3. When ``annotate`` is ``"GENE"`` (case-insensitive), a gene column
       picked from the ``genic`` flag: ``"nearest_upstream_gene"`` when
       the flag is truthy, ``"top_gene"`` otherwise.
    4. ``"SNP"`` in every other case.

    Parameters
    ----------
    annotate : str, optional
        Requested annotation mode or column name.  A falsy value means
        no annotation was requested.
    hits_table : pandas.DataFrame, optional
        Table of hits to be annotated.
    label_col : str, optional
        Explicit column override; takes precedence over ``annotate``.

    Returns
    -------
    str
        Name of the column to read labels from.  ``"SNP"`` is returned
        when nothing was requested, the table is missing or empty, or
        the requested column cannot be resolved.
    """
    fallback = "SNP"

    # Nothing to resolve: previously this path reached the final log call
    # with the result variable unassigned and raised UnboundLocalError.
    if not annotate or hits_table is None or hits_table.empty:
        return fallback

    columns = hits_table.columns
    if label_col is not None and label_col in columns:
        label_clm = label_col
    elif annotate in columns:
        label_clm = annotate
    elif str(annotate).upper() == "GENE":
        # NOTE(review): a single column has to serve every row, so the
        # per-row ``genic`` flag cannot be honoured individually.  The
        # last row decides, which is what the earlier row-by-row loop
        # effectively returned -- confirm this is the intended rule.
        try:
            is_genic = bool(hits_table["genic"].iloc[-1])
        except (KeyError, TypeError, ValueError):
            logger.warning(
                "Annotation columns '%s' and '%s' not found in hits table: %s; "
                "falling back to 'SNP'.", annotate, label_col, list(columns),
            )
            label_clm = fallback
        else:
            label_clm = "nearest_upstream_gene" if is_genic else "top_gene"
            if label_clm not in columns:
                logger.warning(
                    "Gene column '%s' not found in hits table: %s; "
                    "falling back to 'SNP'.", label_clm, list(columns),
                )
                label_clm = fallback
    else:
        logger.warning(
            "Annotation columns '%s' and '%s' not found in hits table: %s; "
            "falling back to 'SNP'.", annotate, label_col, list(columns),
        )
        label_clm = fallback

    logger.info("Annotating by: %s", label_clm)
    return label_clm

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/cli.py RENAMED Viewed

@@ -63,7 +63,7 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
         File delimiter name; auto-detected when ``None``.
     ``build_column`` : str or None
         Column name containing per-variant genome-build values
-        (``hg19`` / ``hg38``).
+        (``hg18`` / ``hg19`` / ``hg38``).
     ``build`` : str or None
         Comma-separated list of genome builds per summary statistics file,
         in the same order as ``sum_stats``.  Alternative to ``build_column``.
@@ -138,6 +138,10 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
         Track sort order.
     ``no_track_labels`` : bool
         Suppress track label rendering when ``True``.
+    ``ylabel`` : str or None
+        Shared y-axis label for linear Manhattan plots.  Override the
+        default (``"-log₁₀(p-value)"`` or the p-value column name) for
+        non-p-value statistics such as ``"iHS"`` or ``"F_ST"``.
     ``plot_title`` : str
         Plot title and output file stem.  Default ``'MyCMplot'``.
     ``plot_title_size`` : float
@@ -252,11 +256,14 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
     opt.add_argument(
         "-b","--build", default=None, required=False, type=str, metavar='str',
         help=
-        """Comma-sperated list of genome build of summary stats file(s) listed
-        in the same order as sumstats files. e.g. hg19,hg38,hg38,hg19 means:
+        """Comma-separated list of genome build of summary stats file(s) listed
+        in the same order as sumstats files. Accepted values: hg18, hg19, hg38.
+        E.g. hg19,hg38,hg38,hg18 means:
         file1.txt.gz --> hg19
         file2.txt.gz --> hg38
-        file3.tsv --> hg38 ... etc
+        file3.tsv --> hg38
+        file4.tsv --> hg18 ... etc
+        hg18 and hg19 coordinates are lifted to hg38 before plotting.
         """
     )
     opt.add_argument(
@@ -365,7 +372,17 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
             "Exclude track labels from plot. (default: False)"
         ),
         action="store_true"
-    )
+    )
+    opt.add_argument(
+        "-yl", "--ylabel",
+        default=None, type=str, metavar="str",
+        help=(
+            "Shared y-axis label for linear Manhattan plots (left margin). "
+            "Useful for non-p-value statistics such as iHS, F_ST or "
+            "XP-EHH (e.g. --ylabel 'iHS'). Defaults to '-log10(p-value)' "
+            "when --logp is set, otherwise the p-value column name."
+        )
+    )
     opt.add_argument(
         "-plt", "--plot_title", default="MyCMplot", type=str, metavar="str",
         help="Plot plot_title / output file stem."
@@ -439,7 +456,10 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
         "-t_space", "--linear_track_spacing", default=0.10, type=float, metavar="float",
         help="Space between linear tracks (default: 0.10)."
     )
+    lio.add_argument(
+        "-figsize", "--figure_size", default='10,4', required=False, type=str, metavar="str",
+        help="Linear plot figure size (default: 10,4 for width,height)."
+    )
     opt.add_argument(
         "-h", "--help", action="help",
         help="Show this help message and exit."

pycmplot-0.2.6/pycmplot/data/hg18ToHg38.over.chain.gz ADDED Viewed

Binary file

pycmplot-0.2.6/pycmplot/data/hg19ToHg38.over.chain.gz ADDED Viewed

Binary file

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/io.py RENAMED Viewed

@@ -644,11 +644,11 @@ def prep_pycmplot_input_info(
                 bcol:      "BUILD",
             }
             col_dtypes = {
-                chrom_col: str,
+                chrom_col: 'category',
                 pos_col:   object,
                 snp_col:   str,
                 pcol_col:  float,
-                bcol:      str,
+                bcol:      'category',
             }
             sumstats_hdr_dic[name] = [old_cols, col_dtypes, new_cols, file_sep]
@@ -662,7 +662,7 @@ def prep_pycmplot_input_info(
                 pcol_col:  "P",
             }
             col_dtypes = {
-                chrom_col: str,
+                chrom_col: 'category',
                 pos_col:   object,
                 snp_col:   str,
                 pcol_col:  float,
@@ -681,18 +681,26 @@ def prep_pycmplot_input_info(
                 pcol_col:  "P",
             }
             col_dtypes = {
-                chrom_col: str,
+                chrom_col: 'category',
                 pos_col:   object,
                 snp_col:   str,
                 pcol_col:  float,
             }
             sumstats_hdr_dic[name] = [old_cols, col_dtypes, new_cols, file_sep]
-    if not any(len(info) == 5 for info in sumstats_hdr_dic.values()):
+    def _has_build_info(info: list) -> bool:
+        """A file has build info when either (a) its header had a build
+        column (which is stored as a fifth entry in ``old_cols``), or
+        (b) a per-file build was supplied via ``--build`` (stored as a
+        fifth entry in the top-level list)."""
+        old_cols = info[0]
+        return len(old_cols) == 5 or len(info) == 5
+    if not any(_has_build_info(info) for info in sumstats_hdr_dic.values()):
         # Neither build column nor --build was available for any file
         logger.warning(
             "No build column or --build values detected. Summary stats will "
-            "be plotted in their respective coordinate systems. If your data "
+            "be plotted in their native coordinate systems. If your data "
             "are in different coordinate systems, combining them in one plot "
             "is not advisable, especially if ``--annotate`` is set!"
         )
@@ -713,6 +721,23 @@ def _merge_min_max_lists(dicts: list[dict]) -> dict:
     return {k: [min(v), max(v)] for k, v in temp.items()}
+# ---------------------------------------------------------------------------
+# Memory usage
+# ---------------------------------------------------------------------------
def _get_memory_usage(mem_df: int) -> str:
    """Format a byte count as a short human-readable string.

    Decimal (SI) units are used: ``GB`` from 1e9 bytes, ``MB`` from 1e6
    bytes, and ``KB`` below that.  The value is rendered with three
    significant digits.

    Parameters
    ----------
    mem_df : int
        Size in bytes, e.g. ``df.memory_usage(deep=True).sum()``.

    Returns
    -------
    str
        The size followed by its unit, e.g. ``"5 MB"`` or ``"2.5 GB"``.
    """
    # Largest unit first, so each size is reported in the biggest unit
    # that keeps the number at or above 1.
    for factor, unit in ((1e9, "GB"), (1e6, "MB")):
        if mem_df >= factor:
            return f"{mem_df / factor:.3g} {unit}"
    return f"{mem_df / 1e3:.3g} KB"
 # ---------------------------------------------------------------------------
 # Main loader
 # ---------------------------------------------------------------------------
@@ -874,20 +899,27 @@ def get_sumstats_and_merged_sector_list(
         ).rename(columns=sumstat_newcols)
         df["POS"] = pd.to_numeric(df["POS"], errors="coerce").astype("Int64").dropna()
+        pre_trim_mem = _get_memory_usage(df.memory_usage(deep=True).sum())
+        pre_trim_vars = len(df.index)
+        logger.info("Loaded %s variants from summary stat file, using %s of memory", pre_trim_vars, pre_trim_mem)
         # Get dict of p-values for qq-plotting before applying trim_pval
         logger.info("Extracting raw p-values for QQ-plotting ...")
-        pval_dict[label] = df["P"].dropna().astype("float").values
+        pval_dict[label] = df["P"].dropna().astype(float).values
         # Add build column if not exist and build supplied
         if build:
             df['BUILD'] = build
+            df['BUILD'] = df['BUILD'].astype('category')
         # Trim insignificant variants for faster plotting
         if trim_pval:
             logger.info("Excluding variants with p-value less than %s to speed up Manhattan plotting ...", trim_pval)
             df = df[df["P"].astype(float) <= float(trim_pval)]
+            post_trim_mem = _get_memory_usage(df.memory_usage(deep=True).sum())
+            post_trim_vars = len(df.index)
+            logger.info("%s variants remain after trimming, using %s of memory", post_trim_vars, post_trim_mem)
         else:
             df = df[df["P"].astype(float) <= 1]
@@ -911,9 +943,16 @@ def get_sumstats_and_merged_sector_list(
         n_chroms = len(df["CHR"].unique()) - 1
         sumstats_loaded[label] = [df, n_chroms]
-        # Liftover hg19 data if needed
-        if "BUILD" in df.columns and "hg19" in df["BUILD"].unique():
-            logger.info("Converting hg19 coordinates to hg38 ...")
+        # Liftover hg18/hg19 data if needed
+        if "BUILD" in df.columns and (
+            "hg19" in df["BUILD"].unique() or "hg18" in df["BUILD"].unique()
+        ):
+            builds_present = sorted(
+                b for b in df["BUILD"].unique() if b in {"hg18", "hg19"}
+            )
+            logger.info(
+                "Converting %s coordinates to hg38 ...", "/".join(builds_present)
+            )
             sumstats_loaded[label][0] = liftover_position(df, resources=resources)
         # Lead SNPs
@@ -1002,6 +1041,7 @@ def get_sumstats_and_merged_sector_list(
     assoc_sector_sizes_list: list[dict] = []
     min_dic_val = None
+    logger.info("Computing per-sumstat sector sizes (chrom → [min_pos, max_pos])")
     for df, _n in sumstats_loaded.values():
         assoc = df[~(df["CHR"].str.len() > 2)].copy()
         assoc["POS"] = assoc["POS"].fillna(0).astype(int)

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/liftover.py RENAMED Viewed

@@ -2,23 +2,34 @@
 pycmplot.liftover
 =================
-Genome coordinate liftover utilities (hg19 → hg38).
+Genome coordinate liftover utilities (hg18 → hg38 and hg19 → hg38).
-The :class:`pyliftover.LiftOver` object is initialised **lazily** — it is
-created on first use and cached in a module-level dictionary, so importing
-this module never triggers a file-not-found error even if the chain file has
-not been configured yet.
+The :class:`pyliftover.LiftOver` objects are initialised **lazily** — they
+are created on first use and cached in a module-level dictionary, so
+importing this module never triggers a file-not-found error even if the
+chain files have not been configured yet.
+Supported conversions
+---------------------
+pycmplot harmonises input coordinates to GRCh38. Two source assemblies are
+supported:
+* ``hg19`` / GRCh37 → GRCh38 (default, bundled chain file)
+* ``hg18`` / NCBI36 → GRCh38 (bundled chain file; used when input rows
+  carry a ``hg18`` build label)
 Resource configuration
 ----------------------
-The chain file path is resolved through
-:class:`~pycmplot.resources.ResourceConfig`.  By default, a bundled chain
-file is used (``pycmplot/data/hg19ToHg38.over.chain``).  This can be
-overridden by setting the environment variable:
+Chain file paths are resolved through
+:class:`~pycmplot.resources.ResourceConfig`.  By default, bundled chain
+files are used (``pycmplot/data/hg19ToHg38.over.chain.gz`` and
+``pycmplot/data/hg18ToHg38.over.chain.gz``).  They can be overridden by
+setting the environment variables:
 .. code-block:: bash
-    export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain
+    export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain.gz
+    export PYCMPLOT_CHAIN_HG18_HG38=/path/to/hg18ToHg38.over.chain.gz
 """
 from __future__ import annotations
@@ -135,17 +146,71 @@ def liftover_hg19_to_hg38(
     return new_pos
def liftover_hg18_to_hg38(
    chrom: str,
    pos: int,
    resources: Optional[ResourceConfig] = None,
) -> Optional[int]:
    """Map one hg18 (NCBI36) position onto hg38.

    The converter is obtained from ``_get_liftover`` using the chain
    file registered under ``"chain_hg18_hg38"`` in *resources*.  When
    the lookup yields several candidate mappings, the first entry of the
    returned list is used.

    Parameters
    ----------
    chrom : str
        Chromosome name without the ``'chr'`` prefix (e.g. ``'1'``,
        ``'X'``); the prefix is prepended here before the lookup.
    pos : int
        hg18 position, passed unchanged to ``convert_coordinate``
        (0-based, as pyliftover expects).
    resources : ResourceConfig, optional
        Resource configuration to resolve the chain file from.  The
        module-level ``default_resources`` is used when ``None``.

    Returns
    -------
    int or None
        The hg38 position of the first mapping, or ``None`` when the
        lookup returns nothing for this position.

    See Also
    --------
    liftover_hg19_to_hg38 :
        The same lookup for hg19 coordinates.
    liftover_position :
        Applies the per-row liftover to a whole DataFrame.
    """
    config = default_resources if resources is None else resources
    converter = _get_liftover(config.require("chain_hg18_hg38"))

    hits = converter.convert_coordinate(f"chr{chrom}", pos)
    if hits:
        # Each hit is (chromosome, position, strand, score); only the
        # position of the first hit is kept.
        _, lifted_pos, _, _ = hits[0]
        return lifted_pos
    return None
 def liftover_position(
     df: pd.DataFrame,
     hg38_chr_limits: dict = None,
     resources: Optional[ResourceConfig] = None,
 ) -> pd.DataFrame:
-    """Liftover all hg19 rows in *df* from hg19 to hg38 coordinates.
+    """Liftover all hg18/hg19 rows in *df* to hg38 coordinates.
-    Iterates over every row in *df* and calls :func:`liftover_hg19_to_hg38`
-    for rows whose ``BUILD`` column equals ``'hg19'``.  Rows with other build
-    values are passed through unchanged.  Rows for which liftover returns
-    ``None`` or ``0`` (unmappable positions) are silently dropped.
+    Iterates over every row in *df* and dispatches to
+    :func:`liftover_hg19_to_hg38` for rows whose ``BUILD`` column equals
+    ``'hg19'`` or to :func:`liftover_hg18_to_hg38` for rows whose ``BUILD``
+    column equals ``'hg18'``.  Rows with any other build value are passed
+    through unchanged.  Rows for which liftover returns ``None`` or ``0``
+    (unmappable positions) are silently dropped.
     Two provenance columns are added to the returned DataFrame so that the
     original coordinates remain accessible:
@@ -207,6 +272,8 @@ def liftover_position(
     for chrom, pos, build in zip(df["CHR"], df["POS"], df["BUILD"]):
         if build == "hg19":
             new_positions.append(liftover_hg19_to_hg38(chrom, pos, resources))
+        elif build == "hg18":
+            new_positions.append(liftover_hg18_to_hg38(chrom, pos, resources))
         else:
             new_positions.append(pos)

{pycmplot-0.2.5 → pycmplot-0.2.6}/pycmplot/plotting/circular.py RENAMED Viewed

@@ -29,6 +29,7 @@ import pandas as pd
 from pycmplot.io import get_output_paths
 from pycmplot.stats import get_highlight_snps
+from pycmplot.annotation import get_annotation_column
 logger = logging.getLogger(__name__)
@@ -622,25 +623,18 @@ def plot_circular(
     # Circular: gene/SNP annotations
     # ------------------------------------------------------------------
     if annotate and not hits_table.empty:
+        label_col = get_annotation_column(
+            annotate = annotate,
+            hits_table=hits_table,
+            label_col=label_col,
+        )
+        if label_col == 'SNP':
+            fstyle = "normal"
+        else:
+            fstyle = "italic"
         for i, (_, row) in enumerate(hits_table.iterrows()):
-            label = row['SNP']
-            fstyle = "normal"
-            if label_col:
-                label_col = str(label_col)
-                try:
-                    if label_col == "GENE":
-                        if row["genic"]:
-                            label = row["nearest_upstream_gene"]
-                        else:
-                            label = row["top_gene"]
-                            fstyle = "italic"
-                    elif label_col != "SNP":
-                        label = row[label_col]
-                        fstyle = "italic"
-                except Exception:
-                    logger.info("'SNP' column is used for annotation since '%s' column could not be resolved in hits table.", label_col)
-                    pass
+            label = row[label_col]
             for sector in circos.sectors:
                 if str(row["CHR"]) == sector.name:
                     a_track = sector.add_track(annotation_track_radius)

pycmplot 0.2.5__tar.gz → 0.2.6__tar.gz

pycmplot 0.2.5__tar.gz → 0.2.6__tar.gz