PyPI - pubnetwork - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pubnetwork 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

pubnet/__init__.py +3 -0
pubnet/analyze.py +391 -0
pubnet/cli.py +255 -0
pubnet/crossref.py +180 -0
pubnet/data/demo.json +457 -0
pubnet/data/scimago.csv +4653 -0
pubnet/fetch.py +277 -0
pubnet/formatters.py +253 -0
pubnet/gui/__init__.py +1 -0
pubnet/gui/app.py +50 -0
pubnet/gui/assets/style.css +331 -0
pubnet/gui/assets/toggle_refs.js +19 -0
pubnet/gui/callbacks.py +478 -0
pubnet/gui/components/__init__.py +1 -0
pubnet/gui/components/clusters.py +87 -0
pubnet/gui/components/network.py +145 -0
pubnet/gui/components/pub_table.py +164 -0
pubnet/gui/components/pubs_per_year.py +61 -0
pubnet/gui/components/stat_cards.py +46 -0
pubnet/gui/components/trends.py +82 -0
pubnet/gui/layouts.py +511 -0
pubnet/journal_if.py +232 -0
pubnet/models.py +137 -0
pubnet/report.py +388 -0
pubnet/templates/report.html +730 -0
pubnetwork-0.1.0.dist-info/METADATA +144 -0
pubnetwork-0.1.0.dist-info/RECORD +30 -0
pubnetwork-0.1.0.dist-info/WHEEL +4 -0
pubnetwork-0.1.0.dist-info/entry_points.txt +2 -0
pubnetwork-0.1.0.dist-info/licenses/LICENSE +139 -0

pubnet/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""PubNet — Publication network analyser for researchers."""
+__version__ = "0.1.0"

pubnet/analyze.py ADDED Viewed

@@ -0,0 +1,391 @@
+"""Analysis modules for PubNet.
+Each public function is a pure function of its arguments, roughly:
+    (list[Publication], **config) → result model
+(build_coauthor_graph and compute_stats additionally take the Author).
+Functions:
+    - clean_publications: dedup, null-fill, normalise
+    - build_coauthor_graph: network analysis
+    - compute_citation_trends: yearly aggregation + rolling h-index
+    - cluster_topics: TF-IDF + k-means
+    - compute_stats: summary statistics
+"""
+from __future__ import annotations
+import logging
+import re
+from collections import Counter
+from pubnet.models import (
+    Author,
+    CitationTrends,
+    CitationYear,
+    CoauthorEdge,
+    CoauthorGraph,
+    CoauthorNode,
+    Publication,
+    StatsSummary,
+    TopicAnalysis,
+    TopicCluster,
+)
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Data cleaning
+# ---------------------------------------------------------------------------
+def clean_publications(publications: list[Publication]) -> list[Publication]:
+    """Clean and deduplicate a list of publications.
+    Steps:
+        1. Fuzzy title dedup (rapidfuzz, threshold 90)
+        2. Null-fill missing years/venues
+        3. Normalise author names
+        4. Sort by year descending (most recent first)
+    """
+    pubs = [p.model_copy() for p in publications]  # don't mutate originals
+    pubs = _dedup_titles(pubs)
+    pubs = _fill_missing(pubs)
+    pubs = _normalise_authors(pubs)
+    pubs.sort(key=lambda p: (p.year or 0, p.citations), reverse=True)
+    return pubs
+def _dedup_titles(pubs: list[Publication]) -> list[Publication]:
+    """Remove near-duplicate publications by fuzzy title matching.
+    When two titles are >90% similar, keep the one with more citations.
+    """
+    try:
+        from rapidfuzz import fuzz
+    except ImportError:
+        logger.warning("rapidfuzz not installed — skipping dedup")
+        return pubs
+    if len(pubs) <= 1:
+        return pubs
+    keep = []
+    removed_indices: set[int] = set()
+    for i, pub_a in enumerate(pubs):
+        if i in removed_indices:
+            continue
+        best = pub_a
+        for j in range(i + 1, len(pubs)):
+            if j in removed_indices:
+                continue
+            pub_b = pubs[j]
+            ratio = fuzz.ratio(
+                _normalise_title(pub_a.title),
+                _normalise_title(pub_b.title),
+            )
+            if ratio > 90:
+                # Keep the one with more citations
+                if pub_b.citations > best.citations:
+                    best = pub_b
+                removed_indices.add(j)
+                logger.debug("Dedup: %r ≈ %r (%.0f%%)", pub_a.title, pub_b.title, ratio)
+        keep.append(best)
+    if removed_indices:
+        logger.info("Dedup removed %d duplicate(s) from %d publications", len(removed_indices), len(pubs))
+    return keep
+def _normalise_title(title: str) -> str:
+    """Lowercase, strip punctuation for fuzzy comparison."""
+    return re.sub(r"[^a-z0-9\s]", "", title.lower()).strip()
+def _fill_missing(pubs: list[Publication]) -> list[Publication]:
+    """Fill None values with sensible defaults."""
+    result = []
+    for pub in pubs:
+        updates = {}
+        if pub.venue is None:
+            updates["venue"] = "Unknown"
+        # Strip whitespace from venue
+        if pub.venue and pub.venue.strip() == "":
+            updates["venue"] = "Unknown"
+        # Year stays None — analysis modules handle it
+        if updates:
+            pub = pub.model_copy(update=updates)
+        result.append(pub)
+    return result
+def _normalise_authors(pubs: list[Publication]) -> list[Publication]:
+    """Normalise author name formats for consistency.
+    Strips extra whitespace. More aggressive normalisation (e.g., merging
+    "J. Smith" and "John Smith") would need a name-matching heuristic that
+    risks false positives, so we keep it simple for now.
+    """
+    result = []
+    for pub in pubs:
+        cleaned = [_clean_author_name(a) for a in pub.authors if a.strip()]
+        if cleaned != pub.authors:
+            pub = pub.model_copy(update={"authors": cleaned})
+        result.append(pub)
+    return result
+def _clean_author_name(name: str) -> str:
+    """Clean up a single author name."""
+    # Collapse whitespace
+    name = re.sub(r"\s+", " ", name).strip()
+    # Remove trailing/leading punctuation
+    name = name.strip(".,;:")
+    return name
+# ---------------------------------------------------------------------------
+# Co-author graph
+# ---------------------------------------------------------------------------
+def build_coauthor_graph(
+    author: Author,
+    publications: list[Publication],
+) -> CoauthorGraph:
+    """Build a co-author network graph.
+    Nodes are people. Edges connect anyone who co-authored at least one paper.
+    Edge weight = number of shared papers.
+    """
+    ego_name = author.name
+    edge_map: dict[tuple[str, str], list[str]] = {}
+    author_papers: Counter[str] = Counter()
+    author_citations: Counter[str] = Counter()
+    for pub in publications:
+        authors = pub.authors
+        if not authors:
+            continue
+        for a in authors:
+            author_papers[a] += 1
+            author_citations[a] += pub.citations
+        # Build edges between ego and each co-author
+        for a in authors:
+            if a == ego_name:
+                continue
+            key = tuple(sorted([ego_name, a]))
+            if key not in edge_map:
+                edge_map[key] = []
+            edge_map[key].append(pub.title)
+        # Build edges between co-authors (not just ego-centric)
+        non_ego = [a for a in authors if a != ego_name]
+        for i, a in enumerate(non_ego):
+            for b in non_ego[i + 1:]:
+                key = tuple(sorted([a, b]))
+                if key not in edge_map:
+                    edge_map[key] = []
+                edge_map[key].append(pub.title)
+    # Build node list
+    all_authors = set()
+    for a, b in edge_map:
+        all_authors.add(a)
+        all_authors.add(b)
+    all_authors.add(ego_name)
+    nodes = [
+        CoauthorNode(
+            name=name,
+            paper_count=author_papers.get(name, 0),
+            total_citations=author_citations.get(name, 0),
+            is_ego=(name == ego_name),
+        )
+        for name in sorted(all_authors)
+    ]
+    edges = [
+        CoauthorEdge(source=a, target=b, weight=len(papers), papers=papers)
+        for (a, b), papers in edge_map.items()
+    ]
+    # Compute average co-authors per paper
+    coauthor_counts = [len(p.authors) - 1 for p in publications if len(p.authors) > 1]
+    avg = sum(coauthor_counts) / len(coauthor_counts) if coauthor_counts else 0.0
+    return CoauthorGraph(
+        nodes=nodes,
+        edges=edges,
+        total_coauthors=len(all_authors) - 1,  # exclude ego
+        avg_coauthors_per_paper=round(avg, 1),
+    )
+# ---------------------------------------------------------------------------
+# Citation trends
+# ---------------------------------------------------------------------------
+def compute_citation_trends(publications: list[Publication]) -> CitationTrends:
+    """Aggregate citations and publications by year, with rolling h-index."""
+    pubs_with_year = [p for p in publications if p.year is not None]
+    if not pubs_with_year:
+        return CitationTrends()
+    years = sorted({p.year for p in pubs_with_year})
+    first_year, last_year = years[0], years[-1]
+    yearly = []
+    for year in range(first_year, last_year + 1):
+        year_pubs = [p for p in pubs_with_year if p.year == year]
+        cumulative_pubs = [p for p in pubs_with_year if p.year <= year]
+        yearly.append(CitationYear(
+            year=year,
+            citation_count=sum(p.citations for p in year_pubs),
+            publication_count=len(year_pubs),
+            cumulative_h_index=_compute_h_index(cumulative_pubs),
+        ))
+    return CitationTrends(
+        yearly=yearly,
+        first_year=first_year,
+        last_year=last_year,
+    )
+def _compute_h_index(publications: list[Publication]) -> int:
+    """Compute h-index: largest h such that h papers have ≥ h citations."""
+    cites = sorted([p.citations for p in publications], reverse=True)
+    h = 0
+    for i, c in enumerate(cites):
+        if c >= i + 1:
+            h = i + 1
+        else:
+            break
+    return h
+# ---------------------------------------------------------------------------
+# Topic clustering
+# ---------------------------------------------------------------------------
+def cluster_topics(
+    publications: list[Publication],
+    num_clusters: int = 5,
+) -> TopicAnalysis:
+    """Cluster publications by topic using TF-IDF + k-means.
+    Falls back gracefully to title-only if abstracts are missing.
+    """
+    try:
+        from sklearn.cluster import KMeans
+        from sklearn.feature_extraction.text import TfidfVectorizer
+    except ImportError:
+        logger.warning("scikit-learn not installed — skipping topic clustering")
+        return TopicAnalysis()
+    if len(publications) < num_clusters:
+        num_clusters = max(1, len(publications))
+    # Build text corpus: title + abstract (or title only)
+    corpus = []
+    valid_indices = []
+    for i, pub in enumerate(publications):
+        text = pub.title
+        if pub.abstract:
+            text = f"{pub.title}. {pub.abstract}"
+        if text.strip():
+            corpus.append(text)
+            valid_indices.append(i)
+    if len(corpus) < 2:
+        return TopicAnalysis()
+    # Adjust num_clusters if we have fewer documents
+    num_clusters = min(num_clusters, len(corpus))
+    vectorizer = TfidfVectorizer(
+        max_features=500,
+        stop_words="english",
+        max_df=0.85,
+        min_df=1,
+    )
+    tfidf_matrix = vectorizer.fit_transform(corpus)
+    feature_names = vectorizer.get_feature_names_out()
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
+    labels = kmeans.fit_predict(tfidf_matrix)
+    # Extract top keywords per cluster from centroids
+    clusters = []
+    for cid in range(num_clusters):
+        centroid = kmeans.cluster_centers_[cid]
+        top_indices = centroid.argsort()[-5:][::-1]
+        keywords = [feature_names[idx] for idx in top_indices]
+        pub_indices = [valid_indices[i] for i, label in enumerate(labels) if label == cid]
+        total_cites = sum(publications[idx].citations for idx in pub_indices)
+        clusters.append(TopicCluster(
+            cluster_id=cid,
+            keywords=keywords,
+            publication_indices=pub_indices,
+            total_citations=total_cites,
+            publication_count=len(pub_indices),
+        ))
+    return TopicAnalysis(clusters=clusters, num_clusters=num_clusters)
+# ---------------------------------------------------------------------------
+# Summary statistics
+# ---------------------------------------------------------------------------
+def compute_stats(
+    author: Author,
+    publications: list[Publication],
+    impact_factors: dict[str, float | None] | None = None,
+) -> StatsSummary:
+    """Compute summary statistics for the profile."""
+    years = [p.year for p in publications if p.year is not None]
+    first_year = min(years) if years else None
+    last_year = max(years) if years else None
+    # Top venue by publication count
+    venue_counts = Counter(p.venue for p in publications if p.venue and p.venue != "Unknown")
+    top_venue, top_venue_count = venue_counts.most_common(1)[0] if venue_counts else (None, 0)
+    # Unique co-authors
+    all_coauthors = set()
+    for pub in publications:
+        for a in pub.authors:
+            if a != author.name:
+                all_coauthors.add(a)
+    # Average impact factor (from enriched data)
+    avg_if = None
+    if impact_factors:
+        known_ifs = [v for v in impact_factors.values() if v is not None]
+        if known_ifs:
+            avg_if = round(sum(known_ifs) / len(known_ifs), 1)
+    total_cites = sum(p.citations for p in publications)
+    years_str = ""
+    if first_year and last_year:
+        years_str = f"{first_year}–{last_year}" if first_year != last_year else str(first_year)
+    return StatsSummary(
+        total_publications=len(publications),
+        total_citations=total_cites,
+        h_index=author.h_index or _compute_h_index(publications),
+        i10_index=author.i10_index or sum(1 for p in publications if p.citations >= 10),
+        years_active=years_str,
+        first_pub_year=first_year,
+        last_pub_year=last_year,
+        top_venue=top_venue,
+        top_venue_count=top_venue_count,
+        unique_coauthors=len(all_coauthors),
+        avg_impact_factor=avg_if,
+        avg_citations_per_paper=round(total_cites / len(publications), 1) if publications else 0.0,
+    )

pubnet/cli.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""PubNet CLI — publication network analyser.
+Entry points:
+    pubnet analyze --scholar-url <url>
+    pubnet analyze --author-id <id>
+    pubnet analyze --builtin
+    pubnet demo
+    pubnet gui
+    pubnet cache list | clear
+"""
+from __future__ import annotations
+import json
+import sys
+import logging
+import click
+from pubnet import __version__
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+def _setup_logging(verbose: bool) -> None:
+    level = logging.DEBUG if verbose else logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="%(levelname)s: %(message)s",
+    )
+# ---------------------------------------------------------------------------
+# Main group
+# ---------------------------------------------------------------------------
+@click.group()
+@click.version_option(__version__, prog_name="pubnet")
+def main():
+    """PubNet — Publication network analyser for researchers."""
+    # Intentionally empty: this is the root command group. Subcommands
+    # attach to it through the @main.command() / @main.group() decorators,
+    # and --version is provided by click.version_option above.
+# ---------------------------------------------------------------------------
+# analyze command
+# ---------------------------------------------------------------------------
+@main.command()
+@click.option("--scholar-url", default=None, help="Google Scholar profile URL.")
+@click.option("--author-id", default=None, help="Google Scholar author ID.")
+@click.option("--builtin", "use_builtin", is_flag=True, help="Use bundled demo profile.")
+@click.option("--format", "ref_format", default="apa", type=click.Choice(["apa", "mla", "bibtex", "vancouver", "chicago"], case_sensitive=False), help="Reference format.")
+@click.option("--topics", default=5, type=int, help="Number of topic clusters.")
+@click.option("--output", "-o", default=None, type=click.Path(), help="Output HTML path.")
+@click.option("--no-cache", is_flag=True, help="Force fresh Scholar fetch.")
+@click.option("--crossref/--no-crossref", default=True, help="Enrich via Crossref API (corrects venue names, adds DOIs).")
+@click.option("--verbose", "-v", is_flag=True, help="Enable debug logging.")
+def analyze(scholar_url, author_id, use_builtin, ref_format, topics, output, no_cache, crossref, verbose):
+    """Analyse a Scholar profile and generate an HTML report."""
+    _setup_logging(verbose)
+    from pubnet.fetch import fetch_profile, load_demo, FetchError
+    from pubnet.analyze import (
+        clean_publications,
+        build_coauthor_graph,
+        compute_citation_trends,
+        cluster_topics,
+        compute_stats,
+    )
+    from pubnet.formatters import format_reference
+    # --- Resolve data source ---
+    if use_builtin:
+        click.echo("Loading built-in demo profile...")
+        author = load_demo()
+    elif scholar_url:
+        click.echo("Fetching profile: " + scholar_url)
+        try:
+            author = fetch_profile(scholar_url, use_cache=not no_cache)
+        except FetchError as exc:
+            click.echo("Error: " + str(exc), err=True)
+            sys.exit(1)
+    elif author_id:
+        click.echo("Fetching profile: " + author_id)
+        try:
+            author = fetch_profile(author_id, use_cache=not no_cache)
+        except FetchError as exc:
+            click.echo("Error: " + str(exc), err=True)
+            sys.exit(1)
+    else:
+        click.echo("Error: provide --scholar-url, --author-id, or --builtin", err=True)
+        sys.exit(1)
+    # --- Clean ---
+    pubs = clean_publications(author.publications)
+    click.echo("Loaded %d publications for %s" % (len(pubs), author.name))
+    # --- Crossref enrichment (corrects venue names, adds DOIs) ---
+    if crossref:
+        from pubnet.crossref import enrich_publications as crossref_enrich
+        click.echo("Enriching via Crossref API (corrects venue names, adds DOIs)...")
+        cr_results = crossref_enrich(pubs, max_lookups=None)
+        corrections = 0
+        for idx, cr in cr_results.items():
+            if cr.venue_corrected and pubs[idx].venue:
+                old = pubs[idx].venue
+                if old != cr.venue_corrected and len(cr.venue_corrected) > 3:
+                    pubs[idx].venue = cr.venue_corrected
+                    corrections += 1
+        if corrections:
+            click.echo("  Corrected %d venue names via Crossref" % corrections)
+    # --- Journal IF lookup ---
+    from pubnet.journal_if import JournalIFLookup
+    click.echo("Looking up journal impact factors...")
+    if_lookup = JournalIFLookup()
+    impact_factors = if_lookup.enrich_publications(pubs)
+    # --- Analyse ---
+    click.echo("Running analysis...")
+    graph = build_coauthor_graph(author, pubs)
+    trends = compute_citation_trends(pubs)
+    topic_result = cluster_topics(pubs, num_clusters=topics)
+    stats = compute_stats(author, pubs, impact_factors=impact_factors)
+    # --- Print summary ---
+    click.echo()
+    click.echo("  " + author.name)
+    if author.affiliation:
+        click.echo("  " + author.affiliation)
+    click.echo("  " + "-" * 39)
+    click.echo("  Publications:      %d" % stats.total_publications)
+    click.echo("  Total citations:   %d" % stats.total_citations)
+    click.echo("  h-index:           %d" % stats.h_index)
+    click.echo("  i10-index:         %d" % stats.i10_index)
+    click.echo("  Years active:      %s" % stats.years_active)
+    click.echo("  Co-authors:        %d" % stats.unique_coauthors)
+    click.echo("  Top venue:         %s (%d pubs)" % (stats.top_venue, stats.top_venue_count))
+    click.echo("  Avg cites/paper:   %s" % stats.avg_citations_per_paper)
+    click.echo()
+    # --- Top publications ---
+    click.echo("  Top publications by citations:")
+    for pub in sorted(pubs, key=lambda p: p.citations, reverse=True)[:5]:
+        click.echo("    [%4d cites] %s" % (pub.citations, pub.title[:70]))
+        click.echo("              %s, %s" % (pub.venue or "Unknown", pub.year or "n.d."))
+    click.echo()
+    # --- Topic clusters ---
+    if topic_result.clusters:
+        click.echo("  Topic clusters (%d):" % topic_result.num_clusters)
+        for cluster in topic_result.clusters:
+            kw = ", ".join(cluster.keywords[:3])
+            click.echo("    Cluster %d: %s (%d pubs, %d cites)" % (
+                cluster.cluster_id, kw, cluster.publication_count, cluster.total_citations))
+        click.echo()
+    # --- Sample reference ---
+    if pubs:
+        top_pub = max(pubs, key=lambda p: p.citations)
+        click.echo("  Sample reference (%s):" % ref_format.upper())
+        click.echo("    %s" % format_reference(top_pub, style=ref_format))
+        click.echo()
+    # --- HTML report ---
+    from pubnet.report import render_report
+    from pathlib import Path
+    if not output:
+        safe_name = author.name.lower().replace(" ", "_")
+        output = "%s_pubnet.html" % safe_name
+    click.echo("  Generating HTML report -> %s" % output)
+    html = render_report(
+        author=author,
+        publications=pubs,
+        stats=stats,
+        coauthor_graph=graph,
+        citation_trends=trends,
+        topic_analysis=topic_result,
+        impact_factors=impact_factors,
+    )
+    Path(output).write_text(html, encoding="utf-8")
+    click.echo("  Done! Report saved to %s" % output)
+# ---------------------------------------------------------------------------
+# demo shortcut
+# ---------------------------------------------------------------------------
+@main.command()
+@click.option("--verbose", "-v", is_flag=True, help="Enable debug logging.")
+@click.pass_context
+def demo(ctx, verbose):
+    """Quick demo using the bundled profile (shortcut for analyze --builtin)."""
+    # Re-uses the analyze command through the click context instead of
+    # duplicating it; only use_builtin and verbose are passed explicitly.
+    # NOTE(review): this relies on click filling the remaining analyze
+    # options with their declared defaults — confirm for the pinned click
+    # version.
+    ctx.invoke(analyze, use_builtin=True, verbose=verbose)
+# ---------------------------------------------------------------------------
+# gui command
+# ---------------------------------------------------------------------------
+@main.command()
+@click.option("--port", default=8050, type=int, help="Server port.")
+@click.option("--scholar-url", default=None, help="Pre-load a Scholar profile URL.")
+@click.option("--debug", is_flag=True, help="Enable Dash debug mode.")
+@click.option("--verbose", "-v", is_flag=True, help="Enable debug logging.")
+def gui(port, scholar_url, debug, verbose):
+    """Launch the interactive Dash GUI."""
+    _setup_logging(verbose)
+    click.echo("Starting PubNet GUI on http://localhost:%d" % port)
+    # Imported here rather than at module top, so the GUI package is only
+    # loaded when this command actually runs.
+    from pubnet.gui.app import create_app
+    # scholar_url may be None, in which case no profile is pre-loaded.
+    app = create_app(scholar_url=scholar_url)
+    # Runs the app's server on the requested port; the call is the last
+    # statement of the command.
+    app.run(port=port, debug=debug)
+# ---------------------------------------------------------------------------
+# cache commands
+# ---------------------------------------------------------------------------
+@main.group()
+def cache():
+    """Manage cached Scholar profiles."""
+    # Intentionally empty: sub-group holding the `cache list` and
+    # `cache clear` commands registered below.
+@cache.command("list")
+def cache_list():
+    """List cached profiles."""
+    from pubnet.fetch import list_cached_profiles
+    profiles = list_cached_profiles()
+    if not profiles:
+        click.echo("No cached profiles.")
+        return
+    for p in profiles:
+        click.echo("  %s  %s  (%d pubs)" % (p["scholar_id"], p["name"], p["publications"]))
+@cache.command("clear")
+@click.confirmation_option(prompt="Delete all cached profiles?")
+def cache_clear():
+    """Clear all cached profiles."""
+    # click.confirmation_option asks for confirmation before this body runs
+    # and aborts the command if the user declines.
+    from pubnet.fetch import clear_cache
+    # clear_cache() returns a count, reported below as the number of
+    # removed profiles.
+    count = clear_cache()
+    click.echo("Removed %d cached profile(s)." % count)
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+# Lets the CLI be started by executing this module directly, in addition to
+# any installed console script.
+if __name__ == "__main__":
+    main()