crossref-local 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Journal lookup module for fast name-to-ISSN resolution.
4
+
5
+ Uses OpenAlex journals table (222k journals with IF proxy) for fast lookups.
6
+ Falls back to direct database query if table doesn't exist.
7
+ """
8
+
9
+ import json
10
+ import sqlite3
11
+ from typing import Dict, List, Optional
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class JournalLookup:
    """
    Fast journal name to ISSN lookup.

    Uses the journals_openalex table (222k journals with an Impact Factor
    proxy) for O(1) lookups. Falls back to a slow scan of the works table's
    JSON metadata if the OpenAlex table doesn't exist.

    Supports use as a context manager so the connection is always closed.
    """

    def __init__(self, db_path: str):
        """
        Initialize journal lookup.

        Args:
            db_path: Path to CrossRef SQLite database
        """
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row  # allow column access by name
        self._log = logging.getLogger(__name__)
        self._openalex_exists = self._table_exists("journals_openalex")
        self._issn_lookup_exists = self._table_exists("issn_lookup")

        if self._openalex_exists:
            self._log.info("Using journals_openalex table for fast lookups")
        else:
            self._log.warning(
                "journals_openalex table not found. "
                "Run download_openalex_journals.py for fast lookups. "
                "Falling back to slow query."
            )

    def _table_exists(self, table_name: str) -> bool:
        """Return True if a table with the given name exists in the database."""
        cursor = self.conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        return cursor.fetchone() is not None

    def _check_openalex_table(self) -> bool:
        """Check if OpenAlex journals table exists."""
        return self._table_exists("journals_openalex")

    def _check_issn_lookup_table(self) -> bool:
        """Check if ISSN lookup table exists."""
        return self._table_exists("issn_lookup")

    def get_issn(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """
        Get ISSN for a journal name.

        Args:
            journal_name: Journal name (case-insensitive)
            strict: If True, only exact matches. If False, allow partial matches.

        Returns:
            ISSN string or None if not found
        """
        if self._openalex_exists:
            return self._get_issn_openalex(journal_name, strict)
        return self._get_issn_slow(journal_name, strict)

    def _get_issn_openalex(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """Fast lookup using the OpenAlex journals table."""
        # Try exact (case-insensitive) match first.
        cursor = self.conn.execute("""
            SELECT issn_l FROM journals_openalex
            WHERE name_lower = ?
            LIMIT 1
        """, (journal_name.lower(),))

        result = cursor.fetchone()
        if result and result[0]:
            return result[0]

        # In strict mode, never fall through to a fuzzy match.
        if strict:
            self._log.debug(f"Strict mode: no exact match for '{journal_name}'")
            return None

        # Partial match: prefer the journal with the most works so an
        # obscure journal with a similar name is less likely to win.
        self._log.warning(f"Using partial match for '{journal_name}' - results may be inaccurate")
        cursor = self.conn.execute("""
            SELECT issn_l, name FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT 1
        """, (f"%{journal_name.lower()}%",))

        result = cursor.fetchone()
        if result and result[0]:
            self._log.warning(f" Matched to: '{result[1]}'")
            return result[0]
        return None

    def _get_issn_slow(self, journal_name: str, strict: bool = True) -> Optional[str]:
        """Slow fallback: scan the works table's JSON metadata."""
        if strict:
            # Exact container-title match.
            cursor = self.conn.execute("""
                SELECT DISTINCT json_extract(metadata, '$.ISSN[0]') as issn
                FROM works
                WHERE json_extract(metadata, '$.container-title[0]') = ?
                AND json_extract(metadata, '$.ISSN[0]') IS NOT NULL
                LIMIT 1
            """, (journal_name,))
        else:
            # Partial container-title match.
            cursor = self.conn.execute("""
                SELECT DISTINCT json_extract(metadata, '$.ISSN[0]') as issn
                FROM works
                WHERE json_extract(metadata, '$.container-title[0]') LIKE ?
                AND json_extract(metadata, '$.ISSN[0]') IS NOT NULL
                LIMIT 1
            """, (f"%{journal_name}%",))

        result = cursor.fetchone()
        return result[0] if result else None

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """
        Search for journals by name.

        Args:
            query: Search query (partial name match)
            limit: Maximum results to return

        Returns:
            List of journal info dictionaries with IF proxy. Empty list if
            the journals_openalex table is not available.
        """
        if not self._openalex_exists:
            return []

        cursor = self.conn.execute("""
            SELECT name, issn_l, publisher, works_count,
                   two_year_mean_citedness, h_index
            FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT ?
        """, (f"%{query.lower()}%", limit))

        return [
            {
                "name": row["name"],
                "issn": row["issn_l"],
                "publisher": row["publisher"],
                "works_count": row["works_count"],
                "if_proxy": row["two_year_mean_citedness"],
                "h_index": row["h_index"]
            }
            for row in cursor.fetchall()
        ]

    def get_info(self, issn: str) -> Optional[Dict]:
        """
        Get journal info by ISSN.

        Args:
            issn: Journal ISSN

        Returns:
            Journal info dictionary with IF proxy, or None if not found or
            the journals_openalex table is not available.
        """
        if not self._openalex_exists:
            return None

        # Try direct ISSN-L match first.
        cursor = self.conn.execute("""
            SELECT name, issn_l, issns, publisher, works_count,
                   two_year_mean_citedness, h_index, is_oa
            FROM journals_openalex
            WHERE issn_l = ?
            LIMIT 1
        """, (issn,))

        row = cursor.fetchone()

        # If not found, resolve alternate ISSNs through the issn_lookup table.
        if not row and self._issn_lookup_exists:
            cursor = self.conn.execute("""
                SELECT jo.name, jo.issn_l, jo.issns, jo.publisher, jo.works_count,
                       jo.two_year_mean_citedness, jo.h_index, jo.is_oa
                FROM issn_lookup il
                JOIN journals_openalex jo ON il.journal_id = jo.id
                WHERE il.issn = ?
                LIMIT 1
            """, (issn,))
            row = cursor.fetchone()

        if row:
            issns = []
            if row["issns"]:
                try:
                    issns = json.loads(row["issns"])
                except (ValueError, TypeError):
                    # Malformed JSON in the issns column; keep the empty list.
                    pass
            return {
                "name": row["name"],
                "issn": row["issn_l"],
                "issns": issns,
                "publisher": row["publisher"],
                "works_count": row["works_count"],
                "if_proxy": row["two_year_mean_citedness"],
                "h_index": row["h_index"],
                "is_oa": row["is_oa"]
            }
        return None

    def get_if_proxy(self, journal_name: str, strict: bool = True) -> Optional[float]:
        """
        Get OpenAlex Impact Factor proxy for a journal.

        Args:
            journal_name: Journal name
            strict: If True, only exact matches

        Returns:
            2-year mean citedness (IF proxy) or None
        """
        if not self._openalex_exists:
            return None

        # Try exact match.
        cursor = self.conn.execute("""
            SELECT two_year_mean_citedness FROM journals_openalex
            WHERE name_lower = ?
            LIMIT 1
        """, (journal_name.lower(),))

        result = cursor.fetchone()
        # "is not None" so a legitimate IF proxy of 0.0 is still returned.
        if result and result[0] is not None:
            return result[0]

        if strict:
            return None

        # Partial match (only if not strict), biggest journal first.
        cursor = self.conn.execute("""
            SELECT two_year_mean_citedness FROM journals_openalex
            WHERE name_lower LIKE ?
            ORDER BY works_count DESC
            LIMIT 1
        """, (f"%{journal_name.lower()}%",))

        result = cursor.fetchone()
        return result[0] if result and result[0] is not None else None

    def close(self):
        """Close database connection."""
        if self.conn:
            self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
@@ -0,0 +1,202 @@
1
+ """MCP server for CrossRef Local - Claude integration.
2
+
3
+ This server exposes crossref-local functionality as MCP tools,
4
+ enabling Claude Desktop and other MCP clients to search academic papers.
5
+
6
+ Usage:
7
+ crossref-local serve # stdio (Claude Desktop)
8
+ crossref-local serve -t http --port 8082 # HTTP transport
9
+ crossref-local-mcp # Direct entry point
10
+ """
11
+
12
+ import json
13
+ from typing import Optional
14
+
15
+ from fastmcp import FastMCP
16
+
17
+ from . import search, get, count, info, __version__
18
+ from .impact_factor import ImpactFactorCalculator
19
+
20
# Initialize the MCP server; the @mcp.tool() functions below register
# themselves on this instance, and run_server() starts it.
mcp = FastMCP(
    name="crossref-local",
    instructions="Local CrossRef database with 167M+ works and full-text search. "
    "Use search_works to find papers, get_work for DOI lookup, count_works for counts, "
    "database_info for stats, and calculate_impact_factor for journal metrics.",
)
27
+
28
+
29
@mcp.tool()
def search_works(
    query: str,
    limit: int = 10,
    offset: int = 0,
    with_abstracts: bool = False,
) -> str:
    """Search for academic works by title, abstract, or authors.

    Backed by an FTS5 full-text index over 167M+ papers, so FTS5 query
    syntax is supported: AND, OR, NOT, "exact phrases".

    Args:
        query: Search query (e.g., "machine learning", "CRISPR", "neural network AND hippocampus")
        limit: Maximum number of results to return (default: 10, capped at 100)
        offset: Skip first N results for pagination (default: 0)
        with_abstracts: Include abstracts in results (default: False)

    Returns:
        JSON string with the total match count and the matching works.

    Examples:
        search_works("machine learning")
        search_works("CRISPR", limit=20)
        search_works("neural network AND memory", with_abstracts=True)
    """
    # Cap the page size so a client cannot request an unbounded result set.
    hits = search(query, limit=min(limit, 100), offset=offset)

    serialized = []
    for hit in hits.works:
        entry = {
            "doi": hit.doi,
            "title": hit.title,
            "authors": hit.authors,
            "year": hit.year,
            "journal": hit.journal,
        }
        # Abstracts are opt-in to keep the default payload small.
        if with_abstracts and hit.abstract:
            entry["abstract"] = hit.abstract
        serialized.append(entry)

    payload = {
        "query": hits.query,
        "total": hits.total,
        "returned": len(serialized),
        "elapsed_ms": round(hits.elapsed_ms, 2),
        "works": serialized,
    }
    return json.dumps(payload, indent=2)
80
+
81
+
82
@mcp.tool()
def get_work(doi: str, as_citation: bool = False) -> str:
    """Look up a single work by its DOI.

    Args:
        doi: Digital Object Identifier (e.g., "10.1038/nature12373")
        as_citation: Return a formatted citation instead of full metadata

    Returns:
        JSON string with work metadata, a formatted citation string, or a
        JSON error object when the DOI is unknown.

    Examples:
        get_work("10.1038/nature12373")
        get_work("10.1126/science.aax0758", as_citation=True)
    """
    record = get(doi)
    if record is None:
        return json.dumps({"error": f"DOI not found: {doi}"})
    if as_citation:
        return record.citation()
    return json.dumps(record.to_dict(), indent=2)
106
+
107
+
108
@mcp.tool()
def count_works(query: str) -> str:
    """Count matching works without fetching results.

    Cheaper than search_works when only the number of matches is needed.

    Args:
        query: FTS5 search query

    Returns:
        JSON string with the query and its match count.

    Examples:
        count_works("CRISPR")
        count_works("machine learning AND deep")
    """
    matches = count(query)
    return json.dumps({"query": query, "count": matches})
126
+
127
+
128
@mcp.tool()
def database_info() -> str:
    """Report database statistics and status.

    Returns:
        JSON string with database path, work count, FTS index count, and citation count.
    """
    return json.dumps(info(), indent=2)
137
+
138
+
139
@mcp.tool()
def calculate_impact_factor(
    journal: str,
    year: int = 2023,
    window: int = 2,
) -> str:
    """Calculate an impact factor for a journal.

    Impact factor = citations received in the target year divided by the
    articles published in the preceding window years.

    Args:
        journal: Journal name or ISSN (e.g., "Nature", "Science", "0028-0836")
        year: Target year for the citation count (default: 2023)
        window: Article window size in years (default: 2 for the standard IF)

    Returns:
        JSON string with journal name, article count, citation count, and
        impact factor — or a JSON error object on failure.

    Examples:
        calculate_impact_factor("Nature")
        calculate_impact_factor("Science", year=2022)
        calculate_impact_factor("Cell", window=5)  # 5-year impact factor
    """
    try:
        with ImpactFactorCalculator() as calculator:
            metrics = calculator.calculate_impact_factor(
                journal_identifier=journal,
                target_year=year,
                window_years=window,
            )
            return json.dumps(metrics, indent=2)
    except Exception as exc:  # tool boundary: surface any failure to the MCP client
        return json.dumps({"error": str(exc)})
172
+
173
+
174
def run_server(
    transport: str = "stdio",
    host: str = "localhost",
    port: int = 8082,
) -> None:
    """Run the MCP server.

    Args:
        transport: Transport protocol ("stdio", "sse", or "http")
        host: Host for HTTP/SSE transport (ignored for stdio)
        port: Port for HTTP/SSE transport (ignored for stdio)

    Raises:
        ValueError: If transport is not one of the supported values.
    """
    if transport == "stdio":
        # stdio takes no host/port — used by Claude Desktop.
        mcp.run(transport="stdio")
        return
    # Map the public transport names onto fastmcp's identifiers.
    fastmcp_transport = {"sse": "sse", "http": "streamable-http"}.get(transport)
    if fastmcp_transport is None:
        raise ValueError(f"Unknown transport: {transport}")
    mcp.run(transport=fastmcp_transport, host=host, port=port)
194
+
195
+
196
def main():
    """Entry point for the crossref-local-mcp console script (stdio transport)."""
    run_server(transport="stdio")


if __name__ == "__main__":
    main()
@@ -0,0 +1,186 @@
1
+ """Data models for crossref_local."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional
5
+ import json
6
+
7
+
8
@dataclass
class Work:
    """
    Represents a scholarly work from CrossRef.

    Attributes:
        doi: Digital Object Identifier
        title: Work title
        authors: List of author names
        year: Publication year
        journal: Journal/container title
        issn: Journal ISSN
        volume: Volume number
        issue: Issue number
        page: Page range
        publisher: Publisher name
        type: Work type (journal-article, book-chapter, etc.)
        abstract: Abstract text (if available)
        url: Resource URL
        citation_count: Number of citations (if available)
        references: List of reference DOIs
    """

    doi: str
    title: Optional[str] = None
    authors: List[str] = field(default_factory=list)
    year: Optional[int] = None
    journal: Optional[str] = None
    issn: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    page: Optional[str] = None
    publisher: Optional[str] = None
    type: Optional[str] = None
    abstract: Optional[str] = None
    url: Optional[str] = None
    citation_count: Optional[int] = None
    references: List[str] = field(default_factory=list)

    @classmethod
    def from_metadata(cls, doi: str, metadata: dict) -> "Work":
        """
        Create Work from CrossRef metadata JSON.

        Args:
            doi: DOI string
            metadata: CrossRef metadata dictionary

        Returns:
            Work instance
        """
        # Extract authors: prefer "given family", fall back to family only,
        # then to the literal "name" field (used for consortia/organizations).
        authors = []
        for author in metadata.get("author", []):
            given = author.get("given", "")
            family = author.get("family", "")
            if given and family:
                authors.append(f"{given} {family}")
            elif family:
                authors.append(family)
            elif author.get("name"):
                authors.append(author["name"])

        # Extract year. CrossRef records may carry the date under "published",
        # "published-print", or "published-online"; try them in that order.
        year = None
        published = (
            metadata.get("published")
            or metadata.get("published-print")
            or metadata.get("published-online")
            or {}
        )
        date_parts = published.get("date-parts", [[]])
        if date_parts and date_parts[0]:
            year = date_parts[0][0]

        # Extract referenced DOIs (entries without a DOI are skipped).
        references = [
            ref["DOI"] for ref in metadata.get("reference", []) if ref.get("DOI")
        ]

        # Container title (journal name) and ISSN are lists; take the first.
        container_titles = metadata.get("container-title", [])
        journal = container_titles[0] if container_titles else None

        issns = metadata.get("ISSN", [])
        issn = issns[0] if issns else None

        titles = metadata.get("title") or []
        title = titles[0] if titles else None

        return cls(
            doi=doi,
            title=title,
            authors=authors,
            year=year,
            journal=journal,
            issn=issn,
            volume=metadata.get("volume"),
            issue=metadata.get("issue"),
            page=metadata.get("page"),
            publisher=metadata.get("publisher"),
            type=metadata.get("type"),
            abstract=metadata.get("abstract"),
            url=metadata.get("URL"),
            citation_count=metadata.get("is-referenced-by-count"),
            references=references,
        )

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "doi": self.doi,
            "title": self.title,
            "authors": self.authors,
            "year": self.year,
            "journal": self.journal,
            "issn": self.issn,
            "volume": self.volume,
            "issue": self.issue,
            "page": self.page,
            "publisher": self.publisher,
            "type": self.type,
            "abstract": self.abstract,
            "url": self.url,
            "citation_count": self.citation_count,
            "references": self.references,
        }

    def citation(self, style: str = "apa") -> str:
        """
        Format as citation string.

        Args:
            style: Citation style (currently only "apa" supported)

        Returns:
            Formatted citation string
        """
        authors_str = ", ".join(self.authors[:3])
        if len(self.authors) > 3:
            authors_str += " et al."

        year_str = f"({self.year})" if self.year else "(n.d.)"
        title_str = self.title or "Untitled"
        journal_str = f"*{self.journal}*" if self.journal else ""

        parts = [authors_str, year_str, title_str]
        if journal_str:
            parts.append(journal_str)
        if self.volume:
            # Standard "volume(issue)" form when both are present.
            volume_str = f"{self.volume}"
            if self.issue:
                volume_str += f"({self.issue})"
            parts.append(volume_str)
        elif self.issue:
            # No volume: emit the issue on its own instead of gluing it
            # onto the previous part (journal or title).
            parts.append(f"({self.issue})")
        if self.page:
            parts.append(self.page)
        parts.append(f"https://doi.org/{self.doi}")

        # filter(None, ...) drops empty parts (e.g. no authors).
        return ". ".join(filter(None, parts))
160
+
161
+
162
@dataclass
class SearchResult:
    """
    Container for search results with metadata.

    Behaves like a read-only sequence of Work objects: supports len(),
    iteration, and indexing (including slices).

    Attributes:
        works: List of Work objects
        total: Total number of matches
        query: Original search query
        elapsed_ms: Search time in milliseconds
    """

    works: List[Work]
    total: int
    query: str
    elapsed_ms: float

    def __len__(self) -> int:
        # Number of works returned in this page, not the total match count.
        return len(self.works)

    def __iter__(self):
        # Iterate directly over the returned Work objects.
        return iter(self.works)

    def __getitem__(self, idx):
        # Delegate indexing (and slicing) to the underlying list.
        return self.works[idx]