PyPI - pubmatrixpython - Versions diffs - 0.2.0__py3-none-any.whl - Mend

pubmatrixpython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

pubmatrix/__init__.py +9 -0
pubmatrix/core.py +411 -0
pubmatrix/heatmap.py +213 -0
pubmatrixpython-0.2.0.dist-info/METADATA +300 -0
pubmatrixpython-0.2.0.dist-info/RECORD +8 -0
pubmatrixpython-0.2.0.dist-info/WHEEL +4 -0
pubmatrixpython-0.2.0.dist-info/licenses/LICENSE +2 -0
pubmatrixpython-0.2.0.dist-info/licenses/LICENSE.md +21 -0

pubmatrix/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from .core import pubmatrix, pubmatrix_from_file
+from .heatmap import plot_pubmatrix_heatmap, pubmatrix_heatmap
+__all__ = [
+    "pubmatrix",
+    "pubmatrix_from_file",
+    "plot_pubmatrix_heatmap",
+    "pubmatrix_heatmap",
+]

pubmatrix/core.py ADDED Viewed

@@ -0,0 +1,411 @@
+"""
+PubMatrix core — systematic literature co-occurrence analysis via NCBI E-utilities.
+Mirrors the R PubMatrixR package (https://github.com/ToledoEM/PubMatrixR-v2).
+Reference: Becker et al. (2003) BMC Bioinformatics 4:61. doi:10.1186/1471-2105-4-61
+"""
+import hashlib
+import json
+import logging
+import math
+import time
+import urllib.parse
+import xml.etree.ElementTree as ET
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from itertools import product
+from pathlib import Path
+import pandas as pd
+import requests
+from tqdm import tqdm
+# Module-level logger; the export helpers use it to report where files were saved.
+logger = logging.getLogger(__name__)
+# E-utilities esearch endpoint that every count query is sent to.
+NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+# Template for the browser-facing search page linked from exported cells:
+# {db} receives the database name and {term} the percent-encoded query.
+PUBMED_SEARCH_BASE = "https://www.ncbi.nlm.nih.gov/{db}/?term={term}"
+# Values accepted for pubmatrix()'s `database` argument.
+VALID_DATABASES = {"pubmed", "pmc"}
+# Values accepted for pubmatrix()'s `export_format` argument (None disables export).
+VALID_EXPORT_FORMATS = {None, "csv", "ods"}
+# NCBI enforces 3 req/s without an API key, 10 req/s with one.
+# pubmatrix() turns the applicable limit into a minimum delay of 1/limit seconds.
+_RATE_LIMIT_DEFAULT = 3
+_RATE_LIMIT_API_KEY = 10
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+def _extract_count(xml_text: str) -> int:
+    """Parse publication count from NCBI esearch XML response."""
+    try:
+        root = ET.fromstring(xml_text)
+    except ET.ParseError as e:
+        raise ValueError(f"Could not parse NCBI XML response: {e}") from e
+    count_el = root.find(".//Count")
+    if count_el is None:
+        raise ValueError("NCBI XML response missing <Count> element")
+    text = (count_el.text or "").strip()
+    if not text.isdigit():
+        raise ValueError(f"<Count> value is not numeric: {text!r}")
+    return int(text)
+def _fetch_count(
+    base_url: str,
+    encoded_term: str,
+    n_tries: int = 2,
+    timeout: int = 30,
+    cache_dir: Path | None = None,
+) -> int:
+    """Fetch publication count for a single search term with retry logic."""
+    url = f"{base_url}&term={encoded_term}&usehistory=y"
+    if cache_dir is not None:
+        cache_key = hashlib.md5(url.encode()).hexdigest()
+        cache_file = cache_dir / f"{cache_key}.json"
+        if cache_file.exists():
+            return json.loads(cache_file.read_text())["count"]
+    last_error = None
+    for attempt in range(n_tries):
+        try:
+            response = requests.get(url, timeout=timeout)
+            response.raise_for_status()
+            count = _extract_count(response.text)
+            if cache_dir is not None:
+                cache_file.write_text(json.dumps({"count": count, "url": url}))
+            return count
+        except requests.RequestException as e:
+            last_error = e
+            if attempt < n_tries - 1:
+                time.sleep(0.25 * (attempt + 1))
+    raise RuntimeError(
+        f"Failed to fetch count after {n_tries} attempts for term {encoded_term!r}: {last_error}"
+    )
+def _validate_daterange(daterange):
+    """Validate and normalise daterange parameter. Returns (start, end) tuple or None."""
+    if daterange is None:
+        return None
+    if len(daterange) != 2:
+        raise ValueError("daterange must have exactly 2 elements: [start_year, end_year]")
+    start, end = daterange
+    if not (math.isfinite(start) and math.isfinite(end)):
+        raise ValueError("daterange values must be finite numbers")
+    start, end = int(round(start)), int(round(end))
+    if start > end:
+        raise ValueError(f"daterange start ({start}) must be <= end ({end})")
+    return (start, end)
+def _build_base_url(database: str, api_key: str | None, daterange) -> str:
+    """Construct the NCBI esearch base URL with optional API key and date range."""
+    params = [f"db={database}", "rettype=count", "retmode=xml"]
+    if api_key:
+        params.append(f"api_key={api_key}")
+    if daterange is not None:
+        start, end = daterange
+        params.append(f"mindate={start}&maxdate={end}&datetype=pdat")
+    return f"{NCBI_BASE}?{'&'.join(params)}"
+def _build_hyperlink_url(database: str, term: str) -> str:
+    """Build a PubMed/PMC search URL for a given term."""
+    encoded = urllib.parse.quote(term)
+    return PUBMED_SEARCH_BASE.format(db=database, term=encoded)
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def pubmatrix(
+    A: list[str],
+    B: list[str],
+    api_key: str | None = None,
+    database: str = "pubmed",
+    daterange=None,
+    outfile: str | None = None,
+    export_format: str | None = None,
+    n_tries: int = 2,
+    n_workers: int = 1,
+    timeout: int = 30,
+    cache_dir: str | None = None,
+) -> pd.DataFrame:
+    """
+    Query PubMed/PMC and build a pairwise co-occurrence matrix.
+    For each pair (a, b) in A × B, counts publications matching 'a AND b'.
+    Parameters
+    ----------
+    A : list of str
+        Search terms for matrix columns.
+    B : list of str
+        Search terms for matrix rows.
+    api_key : str, optional
+        NCBI API key (allows 10 req/s instead of 3 req/s).
+    database : str
+        'pubmed' (default) or 'pmc'.
+    daterange : list or tuple of 2 ints, optional
+        [start_year, end_year] to filter by publication date.
+    outfile : str, optional
+        Base filename for export (required if export_format is set).
+    export_format : str, optional
+        None (no export), 'csv', or 'ods'.
+    n_tries : int
+        Number of retry attempts for failed requests (default 2).
+    n_workers : int
+        Number of parallel workers for concurrent queries (default 1 = serial).
+        Set >1 to speed up large matrices; rate limits are respected automatically.
+    timeout : int
+        HTTP request timeout in seconds (default 30).
+    cache_dir : str, optional
+        Directory to cache query results. Identical queries are loaded from disk
+        instead of re-fetching from NCBI.
+    Returns
+    -------
+    pandas.DataFrame
+        Rows = B terms, columns = A terms, values = publication counts.
+    """
+    # --- Validation (fail-fast, same order as R package) ---
+    if export_format not in VALID_EXPORT_FORMATS:
+        raise ValueError(f"export_format must be one of {VALID_EXPORT_FORMATS}, got {export_format!r}")
+    if export_format is not None and outfile is None:
+        raise ValueError("outfile must be specified when export_format is set")
+    if database not in VALID_DATABASES:
+        raise ValueError(f"database must be one of {VALID_DATABASES}, got {database!r}")
+    if n_tries < 1:
+        raise ValueError(f"n_tries must be >= 1, got {n_tries}")
+    if n_workers < 1:
+        raise ValueError(f"n_workers must be >= 1, got {n_workers}")
+    daterange = _validate_daterange(daterange)
+    if not A or not B:
+        raise ValueError("A and B must be non-empty lists")
+    A = [str(t).strip() for t in A]
+    B = [str(t).strip() for t in B]
+    if any(not t for t in A):
+        raise ValueError("A contains empty or whitespace-only terms")
+    if any(not t for t in B):
+        raise ValueError("B contains empty or whitespace-only terms")
+    # --- Build queries ---
+    pairs = list(product(B, A))  # rows × cols, matches R expand.grid(B, A)
+    encoded_terms = [
+        urllib.parse.quote(f"{b} AND {a}") for b, a in pairs
+    ]
+    base_url = _build_base_url(database, api_key, daterange)
+    # Resolve cache directory once
+    resolved_cache = Path(cache_dir) if cache_dir else None
+    if resolved_cache is not None:
+        resolved_cache.mkdir(parents=True, exist_ok=True)
+    # NCBI rate: 3/s without key, 10/s with key
+    rate_limit = _RATE_LIMIT_API_KEY if api_key else _RATE_LIMIT_DEFAULT
+    min_interval = 1.0 / rate_limit
+    # --- Fetch counts ---
+    counts_map: dict[int, int] = {}
+    if n_workers == 1:
+        # Serial path — simple and predictable
+        for idx, encoded in enumerate(tqdm(encoded_terms, desc="Querying NCBI", unit="query")):
+            counts_map[idx] = _fetch_count(
+                base_url, encoded, n_tries=n_tries, timeout=timeout, cache_dir=resolved_cache
+            )
+            time.sleep(min_interval)
+    else:
+        # Concurrent path — submit all, throttle via sleep between submissions
+        with tqdm(total=len(encoded_terms), desc="Querying NCBI", unit="query") as pbar:
+            with ThreadPoolExecutor(max_workers=n_workers) as executor:
+                future_to_idx = {}
+                for idx, encoded in enumerate(encoded_terms):
+                    future = executor.submit(
+                        _fetch_count, base_url, encoded, n_tries, timeout, resolved_cache
+                    )
+                    future_to_idx[future] = idx
+                    time.sleep(min_interval)
+                for future in as_completed(future_to_idx):
+                    idx = future_to_idx[future]
+                    counts_map[idx] = future.result()
+                    pbar.update(1)
+    counts = [counts_map[i] for i in range(len(encoded_terms))]
+    if len(counts) != len(B) * len(A):
+        raise RuntimeError(
+            f"Expected {len(B) * len(A)} counts, got {len(counts)}"
+        )
+    # --- Assemble matrix (rows=B, cols=A) ---
+    # product(B, A) iterates rows first, so index into the flattened list as [row * n_cols + col]
+    data = {}
+    for j, a in enumerate(A):
+        data[a] = [counts[i * len(A) + j] for i in range(len(B))]
+    df = pd.DataFrame(data, index=B)
+    df.index.name = None
+    # --- Optional export ---
+    if export_format == "csv":
+        _export_csv(df, outfile, database)
+    elif export_format == "ods":
+        _export_ods(df, outfile, database)
+    return df
+def pubmatrix_from_file(filepath: str, **kwargs) -> pd.DataFrame:
+    """
+    Load search terms from a file and run pubmatrix().
+    File format:
+        term_A1
+        term_A2
+        #
+        term_B1
+        term_B2
+    Parameters
+    ----------
+    filepath : str
+        Path to a plain-text file with A terms, a '#' separator, then B terms.
+    **kwargs
+        Passed directly to pubmatrix().
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    path = Path(filepath)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {filepath}")
+    lines = [ln.strip() for ln in path.read_text().splitlines()]
+    lines = [ln for ln in lines if ln]  # drop blank lines
+    if "#" not in lines:
+        raise ValueError("File must contain a '#' line separating A and B terms")
+    sep = lines.index("#")
+    A = lines[:sep]
+    B = lines[sep + 1:]
+    if not A or not B:
+        raise ValueError("File must contain terms both before and after the '#' separator")
+    return pubmatrix(A=A, B=B, **kwargs)
+# ---------------------------------------------------------------------------
+# Export helpers
+# ---------------------------------------------------------------------------
+def _make_hyperlink_formula(url: str, value: int) -> str:
+    """Excel-compatible HYPERLINK formula."""
+    return f'=HYPERLINK("{url}","{value}")'
+def _export_csv(df: pd.DataFrame, outfile: str, database: str) -> None:
+    """
+    Export matrix to CSV with Excel HYPERLINK formulas.
+    Each cell contains a formula linking to the corresponding PubMed/PMC search.
+    """
+    path = Path(outfile).with_suffix(".csv")
+    rows = []
+    for b_term in df.index:
+        row = {}
+        for a_term in df.columns:
+            term = f"{a_term} AND {b_term}"
+            url = _build_hyperlink_url(database, term)
+            count = df.loc[b_term, a_term]
+            row[a_term] = _make_hyperlink_formula(url, count)
+        rows.append(row)
+    export_df = pd.DataFrame(rows, index=df.index)
+    export_df.to_csv(path)
+    logger.info("Saved CSV to %s", path)
+def _export_ods(df: pd.DataFrame, outfile: str, database: str) -> None:
+    """
+    Export matrix to ODS with clickable hyperlinks.
+    Each cell contains a hyperlink to the corresponding PubMed/PMC search,
+    displayed as the publication count.
+    """
+    from odf.opendocument import OpenDocumentSpreadsheet
+    from odf.style import Style, TextProperties
+    from odf.table import Table, TableRow, TableCell
+    from odf.text import A as OdfA, P
+    path = Path(outfile).with_suffix(".ods")
+    doc = OpenDocumentSpreadsheet()
+    link_style = Style(name="LinkStyle", family="text")
+    link_style.addElement(TextProperties(color="#0000EE", textunderlinestyle="solid"))
+    doc.styles.addElement(link_style)
+    table = Table(name="PubMatrix")
+    # Header row
+    header_row = TableRow()
+    header_row.addElement(TableCell(valuetype="string"))  # empty corner
+    for a_term in df.columns:
+        cell = TableCell(valuetype="string")
+        cell.addElement(P(text=a_term))
+        header_row.addElement(cell)
+    table.addElement(header_row)
+    # Data rows
+    for b_term in df.index:
+        row = TableRow()
+        label_cell = TableCell(valuetype="string")
+        label_cell.addElement(P(text=b_term))
+        row.addElement(label_cell)
+        for a_term in df.columns:
+            count = int(df.loc[b_term, a_term])
+            term = f"{a_term} AND {b_term}"
+            url = _build_hyperlink_url(database, term)
+            cell = TableCell(valuetype="string")
+            p = P()
+            link = OdfA(href=url, text=str(count))
+            p.addElement(link)
+            cell.addElement(p)
+            row.addElement(cell)
+        table.addElement(row)
+    doc.spreadsheet.addElement(table)
+    doc.save(str(path))
+    logger.info("Saved ODS to %s", path)

pubmatrix/heatmap.py ADDED Viewed

@@ -0,0 +1,213 @@
+"""
+PubMatrix heatmap visualisation — mirrors heatmap_functions.R from PubMatrixR.
+Provides overlap-percentage heatmaps with optional hierarchical clustering.
+"""
+import logging
+import warnings
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.colors import LinearSegmentedColormap
+from matplotlib.figure import Figure
+from scipy.cluster.hierarchy import dendrogram, linkage
+from scipy.spatial.distance import pdist
+# Module-level logger; used to report where a heatmap image was saved.
+logger = logging.getLogger(__name__)
+# Default red gradient matching R pheatmap palette
+# (ordered light to dark; used when the caller passes no color_palette).
+_RED_GRADIENT = ["#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#99000d"]
+def _to_numeric_matrix(matrix) -> np.ndarray:
+    """Coerce input to a 2-D numeric numpy array."""
+    if isinstance(matrix, pd.DataFrame):
+        arr = matrix.values.astype(float)
+    elif isinstance(matrix, np.ndarray):
+        arr = matrix.astype(float)
+    else:
+        arr = np.array(matrix, dtype=float)
+    if arr.ndim != 2 or arr.shape[0] == 0 or arr.shape[1] == 0:
+        raise ValueError("matrix must be a non-empty 2-D array or DataFrame")
+    return arr
+def _handle_na(arr: np.ndarray) -> np.ndarray:
+    """Replace NaN with 0, emitting a warning if any were found."""
+    nan_mask = np.isnan(arr)
+    if nan_mask.any():
+        positions = list(zip(*np.where(nan_mask)))
+        warnings.warn(
+            f"NA values found at positions {positions[:5]}{'...' if len(positions) > 5 else ''}. "
+            "Converting to 0.",
+            UserWarning,
+            stacklevel=3,
+        )
+        arr = arr.copy()
+        arr[nan_mask] = 0.0
+    return arr
+def _overlap_percentage(arr: np.ndarray) -> np.ndarray:
+    """
+    Compute Jaccard-style overlap percentage for each cell.
+    overlap[i, j] = intersection / union * 100
+    where union = row_total[i] + col_total[j] - intersection
+    """
+    row_totals = arr.sum(axis=1, keepdims=True)   # sum across columns per row
+    col_totals = arr.sum(axis=0, keepdims=True)   # sum across rows per column
+    union = row_totals + col_totals - arr
+    with np.errstate(invalid="ignore", divide="ignore"):
+        pct = np.where(union > 0, arr / union * 100, 0.0)
+    return pct
+def _clustered_order(arr: np.ndarray) -> list[int]:
+    """Return row indices reordered by Euclidean distance / average linkage."""
+    if arr.shape[0] < 2:
+        return list(range(arr.shape[0]))
+    if np.allclose(arr, arr[0]):  # no variation — skip clustering
+        return list(range(arr.shape[0]))
+    dist = pdist(arr, metric="euclidean")
+    Z = linkage(dist, method="average")
+    dend = dendrogram(Z, no_plot=True)
+    return dend["leaves"]
+def _auto_font_size(n_rows: int, n_cols: int) -> float:
+    """Scale annotation font size based on matrix dimensions."""
+    max_dim = max(n_rows, n_cols)
+    if max_dim <= 5:
+        return 10.0
+    elif max_dim <= 10:
+        return 8.0
+    elif max_dim <= 20:
+        return 6.0
+    else:
+        return 4.0
+def plot_pubmatrix_heatmap(
+    matrix,
+    title: str = "PubMatrix Co-occurrence Heatmap",
+    cluster_rows: bool = True,
+    cluster_cols: bool = True,
+    show_numbers: bool = True,
+    color_palette: list[str] | None = None,
+    filename: str | None = None,
+    width: float = 10,
+    height: float = 8,
+    scale_font: bool = True,
+    show: bool = False,
+) -> tuple[Figure, plt.Axes]:
+    """
+    Create a publication-ready heatmap of PubMatrix co-occurrence results.
+    Cell values show overlap percentage: (intersection / union) × 100,
+    where union = row_total + col_total - intersection.
+    Parameters
+    ----------
+    matrix : DataFrame or array-like
+        PubMatrix result (rows = B terms, cols = A terms, values = counts).
+    title : str
+        Heatmap title.
+    cluster_rows, cluster_cols : bool
+        Apply Euclidean distance / average-linkage clustering.
+    show_numbers : bool
+        Annotate cells with overlap percentage values.
+    color_palette : list of str, optional
+        Custom hex color list for gradient. Defaults to red gradient.
+    filename : str, optional
+        Save to this path (PNG). If None, the figure is not saved automatically.
+    width, height : float
+        Figure size in inches.
+    scale_font : bool
+        Auto-scale annotation font size based on matrix dimensions.
+    show : bool
+        Call plt.show() after plotting (default False). Useful in interactive sessions.
+    Returns
+    -------
+    tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]
+    """
+    # --- Input handling ---
+    row_labels = list(matrix.index) if isinstance(matrix, pd.DataFrame) else None
+    col_labels = list(matrix.columns) if isinstance(matrix, pd.DataFrame) else None
+    arr = _to_numeric_matrix(matrix)
+    arr = _handle_na(arr)
+    pct = _overlap_percentage(arr)
+    n_rows, n_cols = arr.shape
+    # --- Clustering ---
+    row_order = _clustered_order(pct) if cluster_rows else list(range(n_rows))
+    col_order = _clustered_order(pct.T) if cluster_cols else list(range(n_cols))
+    pct_ordered = pct[np.ix_(row_order, col_order)]
+    row_labels_ordered = [row_labels[i] for i in row_order] if row_labels else row_order
+    col_labels_ordered = [col_labels[i] for i in col_order] if col_labels else col_order
+    # --- Color map ---
+    colors = color_palette or _RED_GRADIENT
+    cmap = LinearSegmentedColormap.from_list("pubmatrix", colors)
+    # --- Font size ---
+    annot_kws = {}
+    if scale_font:
+        annot_kws["size"] = _auto_font_size(n_rows, n_cols)
+    # --- Plot ---
+    fig, ax = plt.subplots(figsize=(width, height))
+    sns.heatmap(
+        pct_ordered,
+        ax=ax,
+        cmap=cmap,
+        annot=show_numbers,
+        fmt=".1f",
+        annot_kws=annot_kws or None,
+        xticklabels=col_labels_ordered,
+        yticklabels=row_labels_ordered,
+        linewidths=0.5,
+        linecolor="white",
+        cbar_kws={"label": "Overlap %"},
+    )
+    ax.set_title(title, pad=12)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
+    plt.tight_layout()
+    if filename:
+        fig.savefig(filename, dpi=150, bbox_inches="tight")
+        logger.info("Saved heatmap to %s", filename)
+    if show:
+        plt.show()
+    return fig, ax
+def pubmatrix_heatmap(matrix, title: str = "PubMatrix Results") -> tuple[Figure, plt.Axes]:
+    """
+    Convenience wrapper for plot_pubmatrix_heatmap() with default parameters.
+    Parameters
+    ----------
+    matrix : DataFrame or array-like
+        PubMatrix result matrix.
+    title : str
+        Heatmap title.
+    Returns
+    -------
+    tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]
+    """
+    return plot_pubmatrix_heatmap(matrix, title=title)

pubmatrixpython-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,300 @@
+Metadata-Version: 2.4
+Name: pubmatrixpython
+Version: 0.2.0
+Summary: Python port of PubMatrixR — systematic literature co-occurrence analysis via NCBI PubMed
+Project-URL: Homepage, https://toledoem.github.io/pubmatrixp/
+Project-URL: Repository, https://github.com/ToledoEM/PubMatrixPython
+Project-URL: Changelog, https://github.com/ToledoEM/PubMatrixPython/blob/main/CHANGELOG.md
+Author-email: Enrique Toledo <enriquetoledo@gmail.com>
+License-Expression: MIT
+License-File: LICENSE
+License-File: LICENSE.md
+Keywords: bioinformatics,co-occurrence,literature-mining,ncbi,pubmed
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Requires-Python: >=3.10
+Requires-Dist: matplotlib<4,>=3.10
+Requires-Dist: pandas<4,>=2.0
+Requires-Dist: requests<3,>=2.33
+Requires-Dist: scipy<2,>=1.10
+Requires-Dist: seaborn<1,>=0.13
+Requires-Dist: tqdm<5,>=4.60
+Provides-Extra: ods
+Requires-Dist: odfpy>=1.4.1; extra == 'ods'
+Description-Content-Type: text/markdown
+# PubMatrixPython v0.2
+<img src="https://toledoem.github.io/img/LogoPubmatrixP.png" align="right" width="150"/>
+![Python](https://img.shields.io/badge/python-3.10%2B-blue)
+![Tests](https://img.shields.io/badge/tests-60%20passed-brightgreen)
+![License](https://img.shields.io/badge/license-MIT-green)
+Python port of the [PubMatrixR](https://github.com/ToledoEM/PubMatrixR-v2) R package.
+For every pair of search terms `(A, B)`, it counts how many PubMed or PMC publications mention both. Good for mapping relationships between genes, diseases, and pathways across the literature.
+Based on: Becker et al. (2003) *PubMatrix: a tool for multiplex literature mining*. BMC Bioinformatics 4:61. https://doi.org/10.1186/1471-2105-4-61
+---
+## Key features
+- **Pairwise literature search** — automatically searches every combination of terms from two lists
+- **PubMed or PMC** — query MEDLINE abstracts or PMC full text via NCBI E-utilities
+- **Heatmap visualisation** — overlap-percentage heatmaps with optional hierarchical clustering
+- **Export to CSV or ODS** — results include clickable hyperlinks to the matching PubMed search
+- **Date filtering** — restrict searches to a publication year range
+- **Flexible input** — pass term lists directly, or load them from a text file
+- **Concurrency** — `n_workers` for parallel queries, respecting NCBI rate limits
+- **Disk caching** — `cache_dir` persists query results between runs
+- **Progress tracking** — built-in progress bar for long searches
+## Use cases
+- **Gene–disease association studies** — explore literature connections between genes and diseases
+- **Pathway analysis** — investigate co-occurrence of genes within or across biological pathways
+- **Drug–target research** — analyse relationships between compounds and potential targets
+- **Systematic literature reviews** — quantify research coverage across multiple topics
+- **Knowledge gap identification** — find under-researched combinations of terms
+- **Bibliometric analysis** — measure research activity in a domain over time
+---
+## Setup
+Requires [uv](https://docs.astral.sh/uv/). Install it with:
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+Clone and install dependencies:
+```bash
+git clone <repo-url>
+cd PubMatrixPython
+uv sync --all-groups
+```
+---
+## Running the notebooks
+All `uv` commands must be run from the **project root** (`PubMatrixPython/`), where `pyproject.toml` lives.
+```bash
+cd /path/to/PubMatrixPython
+uv run jupyter lab
+```
+Then open any notebook from the `notebooks/` folder in the browser.
+| Notebook | What it covers |
+|----------|---------------|
+| `01_pubmatrix.ipynb` | Basic queries, date filtering, PMC database, file input, CSV export, heatmap visualisation |
+| `02_example_wnt.ipynb` | Full worked example: WNT genes × obesity genes |
+---
+## Quick start (script or REPL)
+### Interactive REPL
+```bash
+uv run python
+```
+```python
+from pubmatrix import pubmatrix, plot_pubmatrix_heatmap
+A = ["WNT1", "WNT2", "CTNNB1"]
+B = ["obesity", "diabetes", "cancer"]
+result = pubmatrix(A=A, B=B)
+print(result)
+plot_pubmatrix_heatmap(result, title="WNT × Disease")
+```
+### Running a script
+Create a file `my_analysis.py`:
+```python
+from pubmatrix import pubmatrix, plot_pubmatrix_heatmap
+A = ["WNT1", "WNT2", "WNT3A", "WNT5A", "CTNNB1"]
+B = ["obesity", "diabetes", "cancer", "inflammation"]
+result = pubmatrix(
+    A=A,
+    B=B,
+    database="pubmed",
+    daterange=[2010, 2024],   # optional date filter
+    outfile="results",
+    export_format="csv",      # saves results.csv with PubMed hyperlinks
+)
+print(result)
+plot_pubmatrix_heatmap(
+    result,
+    title="WNT Genes × Disease",
+    filename="heatmap.png",   # saves to file instead of displaying
+)
+```
+Run it with:
+```bash
+uv run python my_analysis.py
+```
+### Loading terms from a file
+Create `terms.txt`:
+```
+WNT1
+WNT2
+CTNNB1
+#
+obesity
+diabetes
+cancer
+```
+```python
+from pubmatrix import pubmatrix_from_file
+result = pubmatrix_from_file("terms.txt")
+print(result)
+```
+```bash
+uv run python my_analysis.py
+```
+---
+## API reference
+### `pubmatrix(A, B, ...)`
+Query PubMed or PMC and return a `pandas.DataFrame` (rows = B, cols = A).
+```python
+pubmatrix(
+    A,                    # list of str — column terms
+    B,                    # list of str — row terms
+    api_key=None,         # NCBI API key (10 req/s vs 3 req/s default)
+    database="pubmed",    # "pubmed" or "pmc"
+    daterange=None,       # e.g. [2015, 2024]
+    outfile=None,         # base filename for export
+    export_format=None,   # None | "csv" | "ods"
+    n_tries=2,            # total attempts per query (1 = no retry)
+    n_workers=1,          # parallel workers for concurrent queries
+    timeout=30,           # HTTP request timeout in seconds
+    cache_dir=None,       # directory to cache query results on disk
+)
+```
+### `pubmatrix_from_file(filepath, ...)`
+Load terms from a plain-text file and run `pubmatrix()`.
+File format:
+```
+WNT1
+WNT2
+#
+obesity
+diabetes
+```
+```python
+result = pubmatrix_from_file("terms.txt", database="pubmed")
+```
+### `plot_pubmatrix_heatmap(matrix, ...)`
+Heatmap of overlap percentages with optional hierarchical clustering. Returns `(fig, ax)`.
+```python
+fig, ax = plot_pubmatrix_heatmap(
+    matrix,                                        # DataFrame from pubmatrix()
+    title="PubMatrix Co-occurrence Heatmap",
+    cluster_rows=True,
+    cluster_cols=True,
+    show_numbers=True,
+    color_palette=None,                            # list of hex colours
+    filename=None,                                 # save to PNG if set
+    width=10, height=8,
+    scale_font=True,
+    show=False,                                    # call plt.show() after plotting
+)
+```
+### `pubmatrix_heatmap(matrix, title=...)`
+Quick wrapper around `plot_pubmatrix_heatmap()` with all defaults. Returns `(fig, ax)`.
+---
+## Output files
+When `outfile` and `export_format` are set, results are written to
+`{outfile}.csv` or `{outfile}.ods` (any extension already present on `outfile`
+is replaced). Each cell contains the publication count and a hyperlink to the
+matching PubMed or PMC search. Row names come from `B`, column names from `A`.
+ODS export requires the optional `odfpy` dependency:
+```bash
+pip install pubmatrixpython[ods]
+```
+---
+## NCBI API key
+Without a key: 3 requests/second. With a key: 10 requests/second.
+Get one at https://account.ncbi.nlm.nih.gov/
+```python
+result = pubmatrix(A=A, B=B, api_key="YOUR_KEY_HERE")
+```
+---
+## More documentation
+- [Performance notes](docs/performance.md) — rate limits, caching, concurrency
+- [Troubleshooting](docs/troubleshooting.md) — empty results, rate limiting, slow searches
+- [Full reference notebook](https://toledoem.github.io/pubmatrixp/) — every parameter and feature, with output
+---
+## License & citation
+This project is licensed under the MIT License — see [`LICENSE.md`](LICENSE.md).
+If you use PubMatrixPython in your research, please cite:
+> Becker KG, Hosack DA, Dennis G Jr, Lempicki RA, Bright TJ, Cheadle C, Engel J.
+> *PubMatrix: a tool for multiplex literature mining.*
+> BMC Bioinformatics. 2003 Dec 10;4:61. https://doi.org/10.1186/1471-2105-4-61
+**Developers:**
+- Tyler Laird (Author, original PubMatrixR)
+- Enrique Toledo (Author, maintainer)

pubmatrixpython-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+pubmatrix/__init__.py,sha256=2R1IJspkRVVX9LX9iN_fN3gvbCdwl0NoTaG7AEkaWJE,226
+pubmatrix/core.py,sha256=PuR_u7vF2A-5Em08u6YbTQ9J9XwCOaRK7xEf0ufEP-I,13478
+pubmatrix/heatmap.py,sha256=EB2Bw6y3U2YdXRN8VNjhwB59__jXdDt5fPnV6PFzm1U,6733
+pubmatrixpython-0.2.0.dist-info/METADATA,sha256=B8GoCZ6n-auPVMiYXI4XHkx4mpLM85RH0NzmJClTdxE,8689
+pubmatrixpython-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+pubmatrixpython-0.2.0.dist-info/licenses/LICENSE,sha256=d2_z5YBmmkX6hPR-WEPUp5r2bCINz-6H6fl108AlOck,44
+pubmatrixpython-0.2.0.dist-info/licenses/LICENSE.md,sha256=9hUAiG3FYIg0qkm15NoR-OYK9qV5ypHZvaFolJY0tXA,1073
+pubmatrixpython-0.2.0.dist-info/RECORD,,

pubmatrixpython-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

pubmatrixpython-0.2.0.dist-info/licenses/LICENSE ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ YEAR: 2026
2	+ COPYRIGHT HOLDER: Enrique Toledo

pubmatrixpython-0.2.0.dist-info/licenses/LICENSE.md ADDED Viewed

@@ -0,0 +1,21 @@
+# MIT License
+Copyright (c) 2026 Enrique Toledo
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.