pubmatrixpython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pubmatrix/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .core import pubmatrix, pubmatrix_from_file
2
+ from .heatmap import plot_pubmatrix_heatmap, pubmatrix_heatmap
3
+
4
+ __all__ = [
5
+ "pubmatrix",
6
+ "pubmatrix_from_file",
7
+ "plot_pubmatrix_heatmap",
8
+ "pubmatrix_heatmap",
9
+ ]
pubmatrix/core.py ADDED
@@ -0,0 +1,411 @@
1
+ """
2
+ PubMatrix core — systematic literature co-occurrence analysis via NCBI E-utilities.
3
+
4
+ Mirrors the R PubMatrixR package (https://github.com/ToledoEM/PubMatrixR-v2).
5
+ Reference: Becker et al. (2003) BMC Bioinformatics 4:61. doi:10.1186/1471-2105-4-61
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import logging
11
+ import math
12
+ import time
13
+ import urllib.parse
14
+ import xml.etree.ElementTree as ET
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ from itertools import product
17
+ from pathlib import Path
18
+
19
+ import pandas as pd
20
+ import requests
21
+ from tqdm import tqdm
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
26
+ PUBMED_SEARCH_BASE = "https://www.ncbi.nlm.nih.gov/{db}/?term={term}"
27
+
28
+ VALID_DATABASES = {"pubmed", "pmc"}
29
+ VALID_EXPORT_FORMATS = {None, "csv", "ods"}
30
+
31
+ # NCBI enforces 3 req/s without an API key, 10 req/s with one.
32
+ _RATE_LIMIT_DEFAULT = 3
33
+ _RATE_LIMIT_API_KEY = 10
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Internal helpers
38
+ # ---------------------------------------------------------------------------
39
+
40
def _extract_count(xml_text: str) -> int:
    """Return the publication count found in an NCBI esearch XML payload.

    The first ``<Count>`` element in document order is used.

    Raises
    ------
    ValueError
        If the payload is not well-formed XML, contains no ``<Count>``
        element, or the element's text is not made up of digits.
    """
    try:
        document = ET.fromstring(xml_text)
    except ET.ParseError as e:
        raise ValueError(f"Could not parse NCBI XML response: {e}") from e

    node = document.find(".//Count")
    if node is None:
        raise ValueError("NCBI XML response missing <Count> element")

    # An empty element has text None; treat it like an empty string.
    raw = node.text.strip() if node.text else ""
    if raw.isdigit():
        return int(raw)
    raise ValueError(f"<Count> value is not numeric: {raw!r}")
56
+
57
+
58
+ def _fetch_count(
59
+ base_url: str,
60
+ encoded_term: str,
61
+ n_tries: int = 2,
62
+ timeout: int = 30,
63
+ cache_dir: Path | None = None,
64
+ ) -> int:
65
+ """Fetch publication count for a single search term with retry logic."""
66
+ url = f"{base_url}&term={encoded_term}&usehistory=y"
67
+
68
+ if cache_dir is not None:
69
+ cache_key = hashlib.md5(url.encode()).hexdigest()
70
+ cache_file = cache_dir / f"{cache_key}.json"
71
+ if cache_file.exists():
72
+ return json.loads(cache_file.read_text())["count"]
73
+
74
+ last_error = None
75
+ for attempt in range(n_tries):
76
+ try:
77
+ response = requests.get(url, timeout=timeout)
78
+ response.raise_for_status()
79
+ count = _extract_count(response.text)
80
+ if cache_dir is not None:
81
+ cache_file.write_text(json.dumps({"count": count, "url": url}))
82
+ return count
83
+ except requests.RequestException as e:
84
+ last_error = e
85
+ if attempt < n_tries - 1:
86
+ time.sleep(0.25 * (attempt + 1))
87
+
88
+ raise RuntimeError(
89
+ f"Failed to fetch count after {n_tries} attempts for term {encoded_term!r}: {last_error}"
90
+ )
91
+
92
+
93
def _validate_daterange(daterange):
    """Check a ``[start_year, end_year]`` pair and return it as an int tuple.

    ``None`` is passed through unchanged. Both values are rounded to the
    nearest integer before the ordering check.

    Raises
    ------
    ValueError
        If the sequence does not have two elements, a value is NaN or
        infinite, or the start year is after the end year.
    """
    if daterange is None:
        return None

    if len(daterange) != 2:
        raise ValueError("daterange must have exactly 2 elements: [start_year, end_year]")

    first_year, last_year = daterange
    if not math.isfinite(first_year) or not math.isfinite(last_year):
        raise ValueError("daterange values must be finite numbers")

    first_year = int(round(first_year))
    last_year = int(round(last_year))
    if first_year > last_year:
        raise ValueError(f"daterange start ({first_year}) must be <= end ({last_year})")

    return first_year, last_year
110
+
111
+
112
def _build_base_url(database: str, api_key: str | None, daterange) -> str:
    """Construct the esearch URL prefix shared by every query of one run.

    Parameters
    ----------
    database : str
        Value for the ``db`` parameter.
    api_key : str or None
        NCBI API key; omitted from the URL when falsy.
    daterange : tuple of (int, int) or None
        Already-validated ``(start, end)`` years, sent as
        ``mindate``/``maxdate`` with ``datetype=pdat``.

    Returns
    -------
    str
        ``NCBI_BASE`` followed by the query string (without ``term``).
    """
    params = [f"db={database}", "rettype=count", "retmode=xml"]

    if api_key:
        # Percent-encode so a key containing reserved characters cannot
        # corrupt the query string; keys made of unreserved characters
        # produce exactly the same URL as before.
        params.append(f"api_key={urllib.parse.quote(str(api_key), safe='')}")

    if daterange is not None:
        start, end = daterange
        params.extend([f"mindate={start}", f"maxdate={end}", "datetype=pdat"])

    return f"{NCBI_BASE}?{'&'.join(params)}"
124
+
125
+
126
def _build_hyperlink_url(database: str, term: str) -> str:
    """Return the NCBI web-search URL for *term* in *database*.

    The term is percent-encoded with ``urllib.parse.quote`` and substituted
    into the ``PUBMED_SEARCH_BASE`` template.
    """
    return PUBMED_SEARCH_BASE.format(term=urllib.parse.quote(term), db=database)
130
+
131
+
132
+ # ---------------------------------------------------------------------------
133
+ # Public API
134
+ # ---------------------------------------------------------------------------
135
+
136
def _normalise_terms(terms, label: str) -> list[str]:
    """Return *terms* as a list of stripped, de-duplicated strings (a bare string is one term)."""
    if terms is None:
        return []
    if isinstance(terms, str):
        # A lone string would otherwise be iterated character by character.
        terms = [terms]
    cleaned = [str(t).strip() for t in terms]
    # dict.fromkeys drops repeats while keeping first-occurrence order.
    unique = list(dict.fromkeys(cleaned))
    if len(unique) != len(cleaned):
        logger.warning(
            "Dropping %d duplicate term(s) from %s", len(cleaned) - len(unique), label
        )
    return unique


def pubmatrix(
    A: list[str],
    B: list[str],
    api_key: str | None = None,
    database: str = "pubmed",
    daterange=None,
    outfile: str | None = None,
    export_format: str | None = None,
    n_tries: int = 2,
    n_workers: int = 1,
    timeout: int = 30,
    cache_dir: str | None = None,
) -> pd.DataFrame:
    """
    Query PubMed/PMC and build a pairwise co-occurrence matrix.

    For each pair (a, b) in A × B, counts publications matching 'b AND a'.

    Parameters
    ----------
    A : iterable of str
        Search terms for matrix columns. Any iterable is accepted (list,
        tuple, pandas Series, numpy array); a single string is treated as one
        term. Terms are stripped; duplicates are dropped (first kept).
    B : iterable of str
        Search terms for matrix rows; same handling as ``A``.
    api_key : str, optional
        NCBI API key (allows 10 req/s instead of 3 req/s).
    database : str
        'pubmed' (default) or 'pmc'.
    daterange : list or tuple of 2 ints, optional
        [start_year, end_year] to filter by publication date.
    outfile : str, optional
        Base filename for export (required if export_format is set).
    export_format : str, optional
        None (no export), 'csv', or 'ods'.
    n_tries : int
        Total number of attempts per query, the first one included
        (default 2).
    n_workers : int
        Number of parallel workers for concurrent queries (default 1 = serial).
        With more than one worker, submissions are spaced by the rate-limit
        interval.
    timeout : int
        HTTP request timeout in seconds (default 30).
    cache_dir : str, optional
        Directory to cache query results. Identical queries are loaded from disk
        instead of re-fetching from NCBI.

    Returns
    -------
    pandas.DataFrame
        Rows = B terms, columns = A terms, values = publication counts.

    Raises
    ------
    ValueError
        On an invalid argument or empty / blank terms.
    ModuleNotFoundError
        If ``export_format='ods'`` is requested but ``odfpy`` is not installed
        (raised before any query is sent).
    RuntimeError
        If a query still fails after ``n_tries`` attempts.
    """
    # --- Validation (fail-fast, same order as R package) ---
    if export_format not in VALID_EXPORT_FORMATS:
        raise ValueError(f"export_format must be one of {VALID_EXPORT_FORMATS}, got {export_format!r}")

    if export_format is not None and outfile is None:
        raise ValueError("outfile must be specified when export_format is set")

    if database not in VALID_DATABASES:
        raise ValueError(f"database must be one of {VALID_DATABASES}, got {database!r}")

    if n_tries < 1:
        raise ValueError(f"n_tries must be >= 1, got {n_tries}")

    if n_workers < 1:
        raise ValueError(f"n_workers must be >= 1, got {n_workers}")

    daterange = _validate_daterange(daterange)

    # Normalise before the emptiness test: `not A` is ambiguous for pandas
    # Series / numpy arrays and would raise instead of validating.
    A = _normalise_terms(A, "A")
    B = _normalise_terms(B, "B")

    if not A or not B:
        raise ValueError("A and B must be non-empty lists")

    if "" in A:
        raise ValueError("A contains empty or whitespace-only terms")
    if "" in B:
        raise ValueError("B contains empty or whitespace-only terms")

    if export_format == "ods":
        # Check the optional dependency up front so a missing package is not
        # discovered only after every query has already been run.
        import importlib.util

        if importlib.util.find_spec("odf") is None:
            raise ModuleNotFoundError(
                "ODS export requires the optional 'odfpy' dependency; "
                "install it with: pip install pubmatrixpython[ods]",
                name="odf",
            )

    # --- Build queries ---
    pairs = list(product(B, A))  # rows × cols, matches R expand.grid(B, A)
    encoded_terms = [
        urllib.parse.quote(f"{b} AND {a}") for b, a in pairs
    ]

    base_url = _build_base_url(database, api_key, daterange)

    # Resolve cache directory once
    resolved_cache = Path(cache_dir) if cache_dir else None
    if resolved_cache is not None:
        resolved_cache.mkdir(parents=True, exist_ok=True)

    # NCBI rate: 3/s without key, 10/s with key
    rate_limit = _RATE_LIMIT_API_KEY if api_key else _RATE_LIMIT_DEFAULT
    min_interval = 1.0 / rate_limit

    # --- Fetch counts ---
    counts_map: dict[int, int] = {}

    if n_workers == 1:
        # Serial path — simple and predictable
        for idx, encoded in enumerate(tqdm(encoded_terms, desc="Querying NCBI", unit="query")):
            counts_map[idx] = _fetch_count(
                base_url, encoded, n_tries=n_tries, timeout=timeout, cache_dir=resolved_cache
            )
            time.sleep(min_interval)
    else:
        # Concurrent path — submit all, throttle via sleep between submissions
        with tqdm(total=len(encoded_terms), desc="Querying NCBI", unit="query") as pbar:
            with ThreadPoolExecutor(max_workers=n_workers) as executor:
                future_to_idx = {}
                for idx, encoded in enumerate(encoded_terms):
                    future = executor.submit(
                        _fetch_count, base_url, encoded, n_tries, timeout, resolved_cache
                    )
                    future_to_idx[future] = idx
                    time.sleep(min_interval)

                for future in as_completed(future_to_idx):
                    counts_map[future_to_idx[future]] = future.result()
                    pbar.update(1)

    # Restore query order; completion order is arbitrary in the concurrent path.
    counts = [counts_map[i] for i in range(len(encoded_terms))]

    # --- Assemble matrix (rows=B, cols=A) ---
    # product(B, A) iterates rows first, so each row is a contiguous slice of
    # the flattened list.
    n_cols = len(A)
    rows = [counts[r * n_cols:(r + 1) * n_cols] for r in range(len(B))]
    df = pd.DataFrame(rows, index=B, columns=A)
    df.index.name = None

    # --- Optional export ---
    if export_format == "csv":
        _export_csv(df, outfile, database)
    elif export_format == "ods":
        _export_ods(df, outfile, database)

    return df
282
+
283
+
284
def pubmatrix_from_file(filepath: str, **kwargs) -> pd.DataFrame:
    """
    Load search terms from a file and run pubmatrix().

    File format (one term per line, blank lines ignored):
        term_A1
        term_A2
        #
        term_B1
        term_B2

    Parameters
    ----------
    filepath : str
        Path to a UTF-8 plain-text file with A terms, a '#' separator line,
        then B terms. A leading byte-order mark is tolerated. Only the first
        '#' line acts as the separator.
    **kwargs
        Passed directly to pubmatrix().

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If *filepath* does not exist.
    ValueError
        If the separator line is missing, or either side of it is empty.
    """
    path = Path(filepath)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    # Decode explicitly: the platform default encoding would mangle non-ASCII
    # terms on some systems, and "utf-8-sig" also strips a BOM that would
    # otherwise become part of the first line.
    text = path.read_text(encoding="utf-8-sig")
    lines = [ln.strip() for ln in text.splitlines()]
    lines = [ln for ln in lines if ln]  # drop blank lines

    if "#" not in lines:
        raise ValueError("File must contain a '#' line separating A and B terms")

    sep = lines.index("#")
    A = lines[:sep]
    B = lines[sep + 1:]

    if not A or not B:
        raise ValueError("File must contain terms both before and after the '#' separator")

    return pubmatrix(A=A, B=B, **kwargs)
324
+
325
+
326
+ # ---------------------------------------------------------------------------
327
+ # Export helpers
328
+ # ---------------------------------------------------------------------------
329
+
330
def _make_hyperlink_formula(url: str, value: int) -> str:
    """Return a spreadsheet ``HYPERLINK`` formula that shows *value* and links to *url*."""
    return '=HYPERLINK("{}","{}")'.format(url, value)
333
+
334
+
335
def _export_csv(df: pd.DataFrame, outfile: str, database: str) -> None:
    """
    Write the matrix to ``<outfile>.csv`` as spreadsheet HYPERLINK formulas.

    Every cell becomes a formula that displays the count and links to the
    PubMed/PMC search for ``"<column term> AND <row term>"``. The suffix of
    *outfile*, if any, is replaced by ``.csv``.
    """
    target = Path(outfile).with_suffix(".csv")

    # One dict per row label, keyed by column term.
    records = [
        {
            col_term: _make_hyperlink_formula(
                _build_hyperlink_url(database, f"{col_term} AND {row_term}"),
                df.loc[row_term, col_term],
            )
            for col_term in df.columns
        }
        for row_term in df.index
    ]

    pd.DataFrame(records, index=df.index).to_csv(target)
    logger.info("Saved CSV to %s", target)
356
+
357
+
358
def _export_ods(df: pd.DataFrame, outfile: str, database: str) -> None:
    """
    Export matrix to ODS with clickable hyperlinks.

    Each cell contains a hyperlink to the corresponding PubMed/PMC search,
    displayed as the publication count. The suffix of *outfile*, if any, is
    replaced by ``.ods``.

    Raises
    ------
    ModuleNotFoundError
        If the optional ``odfpy`` package is not installed.
    """
    try:
        from odf.opendocument import OpenDocumentSpreadsheet
        from odf.style import Style, TextProperties
        from odf.table import Table, TableRow, TableCell
        from odf.text import A as OdfA, P
    except ModuleNotFoundError as e:
        # Turn the bare "No module named 'odf'" into an actionable message.
        raise ModuleNotFoundError(
            "ODS export requires the optional 'odfpy' dependency; "
            "install it with: pip install pubmatrixpython[ods]",
            name=e.name,
        ) from e

    path = Path(outfile).with_suffix(".ods")
    doc = OpenDocumentSpreadsheet()

    # Registered in the document styles; the anchors created below do not
    # reference it by name.
    link_style = Style(name="LinkStyle", family="text")
    link_style.addElement(TextProperties(color="#0000EE", textunderlinestyle="solid"))
    doc.styles.addElement(link_style)

    table = Table(name="PubMatrix")

    # Header row: empty corner cell followed by one cell per column term.
    header_row = TableRow()
    header_row.addElement(TableCell(valuetype="string"))
    for a_term in df.columns:
        cell = TableCell(valuetype="string")
        cell.addElement(P(text=a_term))
        header_row.addElement(cell)
    table.addElement(header_row)

    # Data rows: row label, then one hyperlinked count per column term.
    for b_term in df.index:
        row = TableRow()
        label_cell = TableCell(valuetype="string")
        label_cell.addElement(P(text=b_term))
        row.addElement(label_cell)

        for a_term in df.columns:
            count = int(df.loc[b_term, a_term])
            url = _build_hyperlink_url(database, f"{a_term} AND {b_term}")

            cell = TableCell(valuetype="string")
            paragraph = P()
            paragraph.addElement(OdfA(href=url, text=str(count)))
            cell.addElement(paragraph)
            row.addElement(cell)

        table.addElement(row)

    doc.spreadsheet.addElement(table)
    doc.save(str(path))
    logger.info("Saved ODS to %s", path)
pubmatrix/heatmap.py ADDED
@@ -0,0 +1,213 @@
1
+ """
2
+ PubMatrix heatmap visualisation — mirrors heatmap_functions.R from PubMatrixR.
3
+
4
+ Provides overlap-percentage heatmaps with optional hierarchical clustering.
5
+ """
6
+
7
+ import logging
8
+ import warnings
9
+
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import pandas as pd
13
+ import seaborn as sns
14
+ from matplotlib.colors import LinearSegmentedColormap
15
+ from matplotlib.figure import Figure
16
+ from scipy.cluster.hierarchy import dendrogram, linkage
17
+ from scipy.spatial.distance import pdist
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Default red gradient matching R pheatmap palette
22
+ _RED_GRADIENT = ["#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#99000d"]
23
+
24
+
25
def _to_numeric_matrix(matrix) -> np.ndarray:
    """Return *matrix* as a new 2-D float array.

    Accepts a DataFrame, an ndarray, or anything ``np.array`` can convert.

    Raises
    ------
    ValueError
        If the result is not two-dimensional or has a zero-length axis.
    """
    source = matrix.values if isinstance(matrix, pd.DataFrame) else matrix
    if isinstance(source, np.ndarray):
        values = source.astype(float)
    else:
        values = np.array(source, dtype=float)

    if values.ndim != 2 or 0 in values.shape:
        raise ValueError("matrix must be a non-empty 2-D array or DataFrame")

    return values
38
+
39
+
40
def _handle_na(arr: np.ndarray) -> np.ndarray:
    """Replace NaN with 0, emitting a ``UserWarning`` if any were found.

    The input is returned unchanged (same object) when it holds no NaN;
    otherwise a copy with the NaN cells zeroed is returned and the input is
    left untouched. The warning lists up to five affected positions.
    """
    nan_mask = np.isnan(arr)
    if not nan_mask.any():
        return arr

    # .tolist() yields plain Python ints, so the message reads "(0, 1)" rather
    # than the "np.int64(0)" scalar repr that NumPy 2 would print.
    positions = [tuple(index) for index in np.argwhere(nan_mask).tolist()]
    warnings.warn(
        f"NA values found at positions {positions[:5]}{'...' if len(positions) > 5 else ''}. "
        "Converting to 0.",
        UserWarning,
        stacklevel=3,
    )
    cleaned = arr.copy()
    cleaned[nan_mask] = 0.0
    return cleaned
54
+
55
+
56
def _overlap_percentage(arr: np.ndarray) -> np.ndarray:
    """
    Return the per-cell overlap percentage of a count matrix.

    overlap[i, j] = arr[i, j] / union[i, j] * 100
    with union[i, j] = row_total[i] + col_total[j] - arr[i, j].
    Cells whose union is not positive are set to 0.
    """
    per_row = arr.sum(axis=1, keepdims=True)
    per_col = arr.sum(axis=0, keepdims=True)
    union = per_row + per_col - arr
    # Division by a zero union is expected; those cells are masked out below.
    with np.errstate(divide="ignore", invalid="ignore"):
        scaled = arr / union * 100
        return np.where(union > 0, scaled, 0.0)
69
+
70
+
71
def _clustered_order(arr: np.ndarray) -> list[int]:
    """Return row indices in dendrogram leaf order.

    Rows are clustered on Euclidean distance with average linkage. The
    natural order is returned when there are fewer than two rows or when
    every row is (numerically) equal to the first one.
    """
    n_items = arr.shape[0]
    if n_items < 2 or np.allclose(arr, arr[0]):
        return list(range(n_items))

    tree = linkage(pdist(arr, metric="euclidean"), method="average")
    return dendrogram(tree, no_plot=True)["leaves"]
81
+
82
+
83
def _auto_font_size(n_rows: int, n_cols: int) -> float:
    """Pick an annotation font size from the larger matrix dimension.

    Up to 5 -> 10.0, up to 10 -> 8.0, up to 20 -> 6.0, anything larger -> 4.0.
    """
    largest = max(n_rows, n_cols)
    for upper_bound, size in ((5, 10.0), (10, 8.0), (20, 6.0)):
        if largest <= upper_bound:
            return size
    return 4.0
94
+
95
+
96
def plot_pubmatrix_heatmap(
    matrix,
    title: str = "PubMatrix Co-occurrence Heatmap",
    cluster_rows: bool = True,
    cluster_cols: bool = True,
    show_numbers: bool = True,
    color_palette: list[str] | None = None,
    filename: str | None = None,
    width: float = 10,
    height: float = 8,
    scale_font: bool = True,
    show: bool = False,
) -> tuple[Figure, plt.Axes]:
    """
    Draw an overlap-percentage heatmap for a PubMatrix count matrix.

    Each cell shows (intersection / union) × 100, where
    union = row_total + col_total - intersection. NaN counts are replaced
    by 0 (with a warning) before the percentages are computed.

    Parameters
    ----------
    matrix : DataFrame or array-like
        Count matrix (rows = B terms, cols = A terms). Index and column
        labels are used as tick labels when a DataFrame is given; otherwise
        positional indices are shown.
    title : str
        Heatmap title.
    cluster_rows, cluster_cols : bool
        Reorder rows / columns by Euclidean-distance, average-linkage
        clustering of the overlap percentages.
    show_numbers : bool
        Annotate cells with the overlap percentage (one decimal place).
    color_palette : list of str, optional
        Colours for the gradient. Defaults to the built-in red gradient.
    filename : str, optional
        If set, the figure is saved to this path at 150 dpi.
    width, height : float
        Figure size in inches.
    scale_font : bool
        Choose the annotation font size from the matrix dimensions.
    show : bool
        Call plt.show() after plotting (default False).

    Returns
    -------
    tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]
    """
    # --- Labels (only available for DataFrame input) ---
    is_frame = isinstance(matrix, pd.DataFrame)
    row_names = list(matrix.index) if is_frame else None
    col_names = list(matrix.columns) if is_frame else None

    # --- Counts -> overlap percentages ---
    counts = _handle_na(_to_numeric_matrix(matrix))
    overlap = _overlap_percentage(counts)
    n_rows, n_cols = counts.shape

    # --- Ordering ---
    row_idx = _clustered_order(overlap) if cluster_rows else list(range(n_rows))
    col_idx = _clustered_order(overlap.T) if cluster_cols else list(range(n_cols))

    reordered = overlap[np.ix_(row_idx, col_idx)]
    y_labels = [row_names[i] for i in row_idx] if row_names else row_idx
    x_labels = [col_names[i] for i in col_idx] if col_names else col_idx

    # --- Appearance ---
    cmap = LinearSegmentedColormap.from_list("pubmatrix", color_palette or _RED_GRADIENT)
    annot_style = {"size": _auto_font_size(n_rows, n_cols)} if scale_font else None

    # --- Plot ---
    fig, ax = plt.subplots(figsize=(width, height))
    sns.heatmap(
        reordered,
        ax=ax,
        cmap=cmap,
        annot=show_numbers,
        fmt=".1f",
        annot_kws=annot_style,
        xticklabels=x_labels,
        yticklabels=y_labels,
        linewidths=0.5,
        linecolor="white",
        cbar_kws={"label": "Overlap %"},
    )

    ax.set_title(title, pad=12)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    plt.tight_layout()

    if filename:
        fig.savefig(filename, dpi=150, bbox_inches="tight")
        logger.info("Saved heatmap to %s", filename)

    if show:
        plt.show()

    return fig, ax
196
+
197
+
198
def pubmatrix_heatmap(matrix, title: str = "PubMatrix Results") -> tuple[Figure, plt.Axes]:
    """
    Plot *matrix* with plot_pubmatrix_heatmap(), leaving every option but the title at its default.

    Parameters
    ----------
    matrix : DataFrame or array-like
        PubMatrix result matrix.
    title : str
        Heatmap title.

    Returns
    -------
    tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]
    """
    figure_and_axes = plot_pubmatrix_heatmap(matrix, title=title)
    return figure_and_axes
@@ -0,0 +1,300 @@
1
+ Metadata-Version: 2.4
2
+ Name: pubmatrixpython
3
+ Version: 0.2.0
4
+ Summary: Python port of PubMatrixR — systematic literature co-occurrence analysis via NCBI PubMed
5
+ Project-URL: Homepage, https://toledoem.github.io/pubmatrixp/
6
+ Project-URL: Repository, https://github.com/ToledoEM/PubMatrixPython
7
+ Project-URL: Changelog, https://github.com/ToledoEM/PubMatrixPython/blob/main/CHANGELOG.md
8
+ Author-email: Enrique Toledo <enriquetoledo@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ License-File: LICENSE.md
12
+ Keywords: bioinformatics,co-occurrence,literature-mining,ncbi,pubmed
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: matplotlib<4,>=3.10
25
+ Requires-Dist: pandas<4,>=2.0
26
+ Requires-Dist: requests<3,>=2.33
27
+ Requires-Dist: scipy<2,>=1.10
28
+ Requires-Dist: seaborn<1,>=0.13
29
+ Requires-Dist: tqdm<5,>=4.60
30
+ Provides-Extra: ods
31
+ Requires-Dist: odfpy>=1.4.1; extra == 'ods'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # PubMatrixPython v0.2
35
+
36
+ <img src="https://toledoem.github.io/img/LogoPubmatrixP.png" align="right" width="150"/>
37
+
38
+ ![Python](https://img.shields.io/badge/python-3.10%2B-blue)
39
+ ![Tests](https://img.shields.io/badge/tests-60%20passed-brightgreen)
40
+ ![License](https://img.shields.io/badge/license-MIT-green)
41
+
42
+ Python port of the [PubMatrixR](https://github.com/ToledoEM/PubMatrixR-v2) R package.
43
+
44
+ For every pair of search terms `(A, B)`, it counts how many PubMed or PMC publications mention both. Good for mapping relationships between genes, diseases, and pathways across the literature.
45
+
46
+ Based on: Becker et al. (2003) *PubMatrix: a tool for multiplex literature mining*. BMC Bioinformatics 4:61. https://doi.org/10.1186/1471-2105-4-61
47
+
48
+ ---
49
+
50
+ ## Key features
51
+
52
+ - **Pairwise literature search** — automatically searches every combination of terms from two lists
53
+ - **PubMed or PMC** — query MEDLINE abstracts or PMC full text via NCBI E-utilities
54
+ - **Heatmap visualisation** — overlap-percentage heatmaps with optional hierarchical clustering
55
+ - **Export to CSV or ODS** — results include clickable hyperlinks to the matching PubMed search
56
+ - **Date filtering** — restrict searches to a publication year range
57
+ - **Flexible input** — pass term lists directly, or load them from a text file
58
+ - **Concurrency** — `n_workers` for parallel queries, respecting NCBI rate limits
59
+ - **Disk caching** — `cache_dir` persists query results between runs
60
+ - **Progress tracking** — built-in progress bar for long searches
61
+
62
+ ## Use cases
63
+
64
+ - **Gene–disease association studies** — explore literature connections between genes and diseases
65
+ - **Pathway analysis** — investigate co-occurrence of genes within or across biological pathways
66
+ - **Drug–target research** — analyse relationships between compounds and potential targets
67
+ - **Systematic literature reviews** — quantify research coverage across multiple topics
68
+ - **Knowledge gap identification** — find under-researched combinations of terms
69
+ - **Bibliometric analysis** — measure research activity in a domain over time
70
+
71
+ ---
72
+
73
+ ## Setup
74
+
75
+ Requires [uv](https://docs.astral.sh/uv/). Install it with:
76
+
77
+ ```bash
78
+ curl -LsSf https://astral.sh/uv/install.sh | sh
79
+ ```
80
+
81
+ Clone and install dependencies:
82
+
83
+ ```bash
84
+ git clone https://github.com/ToledoEM/PubMatrixPython.git
85
+ cd PubMatrixPython
86
+ uv sync --all-groups
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Running the notebooks
92
+
93
+ All `uv` commands must be run from the **project root** (`PubMatrixPython/`), where `pyproject.toml` lives.
94
+
95
+ ```bash
96
+ cd /path/to/PubMatrixPython
97
+ uv run jupyter lab
98
+ ```
99
+
100
+ Then open any notebook from the `notebooks/` folder in the browser.
101
+
102
+ | Notebook | What it covers |
103
+ |----------|---------------|
104
+ | `01_pubmatrix.ipynb` | Basic queries, date filtering, PMC database, file input, CSV export, heatmap visualisation |
105
+ | `02_example_wnt.ipynb` | Full worked example: WNT genes × obesity genes |
106
+
107
+ ---
108
+
109
+ ## Quick start (script or REPL)
110
+
111
+ ### Interactive REPL
112
+
113
+ ```bash
114
+ uv run python
115
+ ```
116
+
117
+ ```python
118
+ from pubmatrix import pubmatrix, plot_pubmatrix_heatmap
119
+
120
+ A = ["WNT1", "WNT2", "CTNNB1"]
121
+ B = ["obesity", "diabetes", "cancer"]
122
+
123
+ result = pubmatrix(A=A, B=B)
124
+ print(result)
125
+
126
+ plot_pubmatrix_heatmap(result, title="WNT × Disease", show=True) # show=True opens the figure; the default only returns (fig, ax)
127
+ ```
128
+
129
+ ### Running a script
130
+
131
+ Create a file `my_analysis.py`:
132
+
133
+ ```python
134
+ from pubmatrix import pubmatrix, plot_pubmatrix_heatmap
135
+
136
+ A = ["WNT1", "WNT2", "WNT3A", "WNT5A", "CTNNB1"]
137
+ B = ["obesity", "diabetes", "cancer", "inflammation"]
138
+
139
+ result = pubmatrix(
140
+ A=A,
141
+ B=B,
142
+ database="pubmed",
143
+ daterange=[2010, 2024], # optional date filter
144
+ outfile="results",
145
+ export_format="csv", # saves results.csv with PubMed hyperlinks
146
+ )
147
+
148
+ print(result)
149
+
150
+ plot_pubmatrix_heatmap(
151
+ result,
152
+ title="WNT Genes × Disease",
153
+ filename="heatmap.png", # saves to file instead of displaying
154
+ )
155
+ ```
156
+
157
+ Run it with:
158
+
159
+ ```bash
160
+ uv run python my_analysis.py
161
+ ```
162
+
163
+ ### Loading terms from a file
164
+
165
+ Create `terms.txt`:
166
+
167
+ ```
168
+ WNT1
169
+ WNT2
170
+ CTNNB1
171
+ #
172
+ obesity
173
+ diabetes
174
+ cancer
175
+ ```
176
+
177
+ ```python
178
+ from pubmatrix import pubmatrix_from_file
179
+
180
+ result = pubmatrix_from_file("terms.txt")
181
+ print(result)
182
+ ```
183
+
184
+ ```bash
185
+ uv run python my_analysis.py
186
+ ```
187
+
188
+ ---
189
+
190
+ ## API reference
191
+
192
+ ### `pubmatrix(A, B, ...)`
193
+
194
+ Query PubMed or PMC (selected with `database`) and return a `pandas.DataFrame` of publication counts (rows = B, cols = A).
195
+
196
+ ```python
197
+ pubmatrix(
198
+ A, # list of str — column terms
199
+ B, # list of str — row terms
200
+ api_key=None, # NCBI API key (10 req/s vs 3 req/s default)
201
+ database="pubmed", # "pubmed" or "pmc"
202
+ daterange=None, # e.g. [2015, 2024]
203
+ outfile=None, # base filename for export
204
+ export_format=None, # None | "csv" | "ods"
205
+ n_tries=2, # total attempts per query on network failure (first try included)
206
+ n_workers=1, # parallel workers for concurrent queries
207
+ timeout=30, # HTTP request timeout in seconds
208
+ cache_dir=None, # directory to cache query results on disk
209
+ )
210
+ ```
211
+
212
+ ### `pubmatrix_from_file(filepath, ...)`
213
+
214
+ Load terms from a plain-text file and run `pubmatrix()`.
215
+
216
+ File format:
217
+ ```
218
+ WNT1
219
+ WNT2
220
+ #
221
+ obesity
222
+ diabetes
223
+ ```
224
+
225
+ ```python
226
+ result = pubmatrix_from_file("terms.txt", database="pubmed")
227
+ ```
228
+
229
+ ### `plot_pubmatrix_heatmap(matrix, ...)`
230
+
231
+ Heatmap of overlap percentages with optional hierarchical clustering. Returns `(fig, ax)`.
232
+
233
+ ```python
234
+ fig, ax = plot_pubmatrix_heatmap(
235
+ matrix, # DataFrame from pubmatrix()
236
+ title="PubMatrix Co-occurrence Heatmap",
237
+ cluster_rows=True,
238
+ cluster_cols=True,
239
+ show_numbers=True,
240
+ color_palette=None, # list of hex colours
241
+ filename=None, # save to PNG if set
242
+ width=10, height=8,
243
+ scale_font=True,
244
+ show=False, # call plt.show() after plotting
245
+ )
246
+ ```
247
+
248
+ ### `pubmatrix_heatmap(matrix, title=...)`
249
+
250
+ Quick wrapper around `plot_pubmatrix_heatmap()` with all defaults. Returns `(fig, ax)`.
251
+
252
+ ---
253
+
254
+ ## Output files
255
+
256
+ When `outfile` and `export_format` are set, results are written to
257
+ `{outfile}.{extension}` (`.csv` or `.ods`; an existing suffix on `outfile` is replaced). Each cell contains the
258
+ publication count and a hyperlink to the matching PubMed search. Row names
259
+ come from `B`, column names from `A`.
260
+
261
+ ODS export requires the optional `odfpy` dependency:
262
+
263
+ ```bash
264
+ pip install pubmatrixpython[ods]
265
+ ```
266
+
267
+ ---
268
+
269
+ ## NCBI API key
270
+
271
+ Without a key: 3 requests/second. With a key: 10 requests/second.
272
+ Get one at https://account.ncbi.nlm.nih.gov/
273
+
274
+ ```python
275
+ result = pubmatrix(A=A, B=B, api_key="YOUR_KEY_HERE")
276
+ ```
277
+
278
+ ---
279
+
280
+ ## More documentation
281
+
282
+ - [Performance notes](docs/performance.md) — rate limits, caching, concurrency
283
+ - [Troubleshooting](docs/troubleshooting.md) — empty results, rate limiting, slow searches
284
+ - [Full reference notebook](https://toledoem.github.io/pubmatrixp/) — every parameter and feature, with output
285
+
286
+ ---
287
+
288
+ ## License & citation
289
+
290
+ This project is licensed under the MIT License — see [`LICENSE.md`](LICENSE.md).
291
+
292
+ If you use PubMatrixPython in your research, please cite:
293
+
294
+ > Becker KG, Hosack DA, Dennis G Jr, Lempicki RA, Bright TJ, Cheadle C, Engel J.
295
+ > *PubMatrix: a tool for multiplex literature mining.*
296
+ > BMC Bioinformatics. 2003 Dec 10;4:61. https://doi.org/10.1186/1471-2105-4-61
297
+
298
+ **Developers:**
299
+ - Tyler Laird (Author, original PubMatrixR)
300
+ - Enrique Toledo (Author, maintainer)
@@ -0,0 +1,8 @@
1
+ pubmatrix/__init__.py,sha256=2R1IJspkRVVX9LX9iN_fN3gvbCdwl0NoTaG7AEkaWJE,226
2
+ pubmatrix/core.py,sha256=PuR_u7vF2A-5Em08u6YbTQ9J9XwCOaRK7xEf0ufEP-I,13478
3
+ pubmatrix/heatmap.py,sha256=EB2Bw6y3U2YdXRN8VNjhwB59__jXdDt5fPnV6PFzm1U,6733
4
+ pubmatrixpython-0.2.0.dist-info/METADATA,sha256=B8GoCZ6n-auPVMiYXI4XHkx4mpLM85RH0NzmJClTdxE,8689
5
+ pubmatrixpython-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
6
+ pubmatrixpython-0.2.0.dist-info/licenses/LICENSE,sha256=d2_z5YBmmkX6hPR-WEPUp5r2bCINz-6H6fl108AlOck,44
7
+ pubmatrixpython-0.2.0.dist-info/licenses/LICENSE.md,sha256=9hUAiG3FYIg0qkm15NoR-OYK9qV5ypHZvaFolJY0tXA,1073
8
+ pubmatrixpython-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ YEAR: 2026
2
+ COPYRIGHT HOLDER: Enrique Toledo
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+
3
+ Copyright (c) 2026 Enrique Toledo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.