crossref-local 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossref_local/__init__.py +18 -10
- crossref_local/_aio/__init__.py +30 -0
- crossref_local/_aio/_impl.py +238 -0
- crossref_local/_cache/__init__.py +15 -0
- crossref_local/{cache_export.py → _cache/export.py} +27 -10
- crossref_local/_cache/utils.py +93 -0
- crossref_local/_cli/__init__.py +9 -0
- crossref_local/_cli/cli.py +512 -0
- crossref_local/_cli/mcp.py +351 -0
- crossref_local/_cli/mcp_server.py +413 -0
- crossref_local/_core/__init__.py +58 -0
- crossref_local/{api.py → _core/api.py} +24 -5
- crossref_local/{citations.py → _core/citations.py} +55 -26
- crossref_local/{config.py → _core/config.py} +40 -22
- crossref_local/{db.py → _core/db.py} +32 -26
- crossref_local/{fts.py → _core/fts.py} +18 -14
- crossref_local/{models.py → _core/models.py} +11 -6
- crossref_local/_remote/__init__.py +56 -0
- crossref_local/_remote/base.py +356 -0
- crossref_local/_remote/collections.py +175 -0
- crossref_local/_server/__init__.py +140 -0
- crossref_local/_server/middleware.py +25 -0
- crossref_local/_server/models.py +129 -0
- crossref_local/_server/routes_citations.py +98 -0
- crossref_local/_server/routes_collections.py +282 -0
- crossref_local/_server/routes_compat.py +102 -0
- crossref_local/_server/routes_works.py +128 -0
- crossref_local/_server/server.py +19 -0
- crossref_local/aio.py +30 -206
- crossref_local/cache.py +100 -100
- crossref_local/cli.py +5 -515
- crossref_local/jobs.py +169 -0
- crossref_local/mcp_server.py +5 -410
- crossref_local/remote.py +5 -266
- crossref_local/server.py +5 -349
- {crossref_local-0.4.0.dist-info → crossref_local-0.5.0.dist-info}/METADATA +36 -11
- crossref_local-0.5.0.dist-info/RECORD +47 -0
- {crossref_local-0.4.0.dist-info → crossref_local-0.5.0.dist-info}/entry_points.txt +1 -1
- crossref_local/cli_mcp.py +0 -275
- crossref_local-0.4.0.dist-info/RECORD +0 -27
- /crossref_local/{cache_viz.py → _cache/viz.py} +0 -0
- /crossref_local/{cli_cache.py → _cli/cache.py} +0 -0
- /crossref_local/{cli_completion.py → _cli/completion.py} +0 -0
- /crossref_local/{cli_main.py → _cli/main.py} +0 -0
- /crossref_local/{impact_factor → _impact_factor}/__init__.py +0 -0
- /crossref_local/{impact_factor → _impact_factor}/calculator.py +0 -0
- /crossref_local/{impact_factor → _impact_factor}/journal_lookup.py +0 -0
- {crossref_local-0.4.0.dist-info → crossref_local-0.5.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""Remote API client for crossref_local.
|
|
2
|
+
|
|
3
|
+
Connects to a CrossRef Local API server instead of direct database access.
|
|
4
|
+
Use this when the database is on a remote server accessible via HTTP.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import urllib.request
|
|
9
|
+
import urllib.parse
|
|
10
|
+
import urllib.error
|
|
11
|
+
from typing import List, Optional, Dict, Any
|
|
12
|
+
|
|
13
|
+
from .._core.models import Work, SearchResult
|
|
14
|
+
from .._core.config import DEFAULT_PORT
|
|
15
|
+
|
|
16
|
+
# Default URL uses SCITEX port convention
|
|
17
|
+
DEFAULT_API_URL = f"http://localhost:{DEFAULT_PORT}"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RemoteClient:
    """
    HTTP client for CrossRef Local API server.

    Provides the same interface as the local API but connects
    to a remote server via HTTP.

    Example:
        >>> client = RemoteClient("http://localhost:31291")
        >>> results = client.search(title="machine learning", limit=10)
        >>> work = client.get("10.1038/nature12373")
    """

    def __init__(self, base_url: str = DEFAULT_API_URL, timeout: int = 30):
        """
        Initialize remote client.

        Args:
            base_url: API server URL (default: DEFAULT_API_URL, derived from
                DEFAULT_PORT, e.g. http://localhost:31291)
            timeout: Request timeout in seconds
        """
        # Normalize so endpoint paths can always be appended with a leading "/".
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def _work_from_dict(item: Dict[str, Any], fallback_doi: str = "") -> Work:
        """Build a Work from a JSON response dict.

        Shared by search(), get() and get_many() so the field mapping is
        defined in exactly one place.
        """
        return Work(
            doi=item.get("doi", fallback_doi),
            title=item.get("title", ""),
            authors=item.get("authors", []),
            year=item.get("year"),
            journal=item.get("journal"),
            volume=item.get("volume"),
            issue=item.get("issue"),
            # Some endpoints return "pages" instead of "page"; accept either.
            page=item.get("page") or item.get("pages"),
            abstract=item.get("abstract"),
            citation_count=item.get("citation_count"),
        )

    def _request(
        self,
        endpoint: str,
        params: Optional[Dict[str, Any]] = None,
        method: str = "GET",
        data: Optional[Dict[str, Any]] = None,
    ) -> Optional[Dict]:
        """Make an HTTP request to the API.

        Args:
            endpoint: Path starting with "/" appended to base_url.
            params: Query parameters; None values are dropped.
            method: HTTP method (GET, POST, DELETE, ...).
            data: JSON body for POST-like requests.

        Returns:
            Decoded JSON body, or None when the server answers 404.

        Raises:
            ConnectionError: On non-404 HTTP errors or network failures.
        """
        url = f"{self.base_url}{endpoint}"
        if params:
            # Filter out None values so optional params are omitted entirely.
            params = {k: v for k, v in params.items() if v is not None}
            if params:
                url = f"{url}?{urllib.parse.urlencode(params)}"

        try:
            req_data = None
            if data is not None:
                req_data = json.dumps(data).encode("utf-8")

            req = urllib.request.Request(url, data=req_data, method=method)
            req.add_header("Accept", "application/json")
            if req_data:
                req.add_header("Content-Type", "application/json")

            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                return json.loads(response.read().decode("utf-8"))
        except urllib.error.HTTPError as e:
            if e.code == 404:
                # "Not found" is an expected outcome, not an error.
                return None
            raise ConnectionError(f"API request failed: {e.code} {e.reason}") from e
        except urllib.error.URLError as e:
            raise ConnectionError(
                f"Cannot connect to API at {self.base_url}: {e.reason}"
            ) from e

    def health(self) -> Dict:
        """Check API server health."""
        return self._request("/health")

    def info(self) -> Dict:
        """Get database/API information.

        Combines the root endpoint (version/status) with /info (counts).
        """
        root = self._request("/")
        info_data = self._request("/info")
        return {
            "api_url": self.base_url,
            "api_version": root.get("version", "unknown"),
            "status": root.get("status", "unknown"),
            "mode": "remote",
            "works": info_data.get("total_papers", 0) if info_data else 0,
            "fts_indexed": info_data.get("fts_indexed", 0) if info_data else 0,
            "citations": info_data.get("citations", 0) if info_data else 0,
        }

    def search(
        self,
        query: Optional[str] = None,
        doi: Optional[str] = None,
        title: Optional[str] = None,
        authors: Optional[str] = None,
        year: Optional[int] = None,
        limit: int = 10,
        offset: int = 0,
    ) -> SearchResult:
        """
        Search for papers.

        Args:
            query: Full-text search query (searches title by default)
            doi: Search by DOI
            title: Search by title (explicit)
            authors: Search by author name
            year: Filter by publication year
            limit: Maximum results (default: 10, max: 100)
            offset: Skip first N results for pagination

        Returns:
            SearchResult with matching works

        NOTE(review): the /works endpoint only accepts a free-text query, so
        ``doi``, ``authors`` and ``year`` are currently not forwarded to the
        server — confirm against the server's query support before relying on
        them for filtering.
        """
        # Use new /works endpoint with FTS5 search
        search_query = query or title

        params = {
            "q": search_query,
            # Server caps page size at 100; clamp client-side too.
            "limit": min(limit, 100),
            "offset": offset,
        }

        data = self._request("/works", params)

        if not data:
            return SearchResult(works=[], total=0, query=query or "", elapsed_ms=0.0)

        works = [self._work_from_dict(item) for item in data.get("results", [])]

        return SearchResult(
            works=works,
            total=data.get("total", len(works)),
            query=query or title or doi or "",
            elapsed_ms=data.get("elapsed_ms", 0.0),
        )

    def get(self, doi: str) -> Optional[Work]:
        """
        Get a work by DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            Work object or None if not found
        """
        # DOI is passed un-encoded; the server routes /works/{doi} as a path.
        data = self._request(f"/works/{doi}")
        if not data or "error" in data:
            return None
        return self._work_from_dict(data, fallback_doi=doi)

    def get_many(self, dois: List[str]) -> List[Work]:
        """
        Get multiple works by DOI using batch endpoint.

        Falls back to per-DOI lookups when the server does not provide
        POST /works/batch.

        Args:
            dois: List of DOIs

        Returns:
            List of Work objects (missing DOIs are simply omitted)
        """
        try:
            result = self._request("/works/batch", method="POST", data={"dois": dois})
            if result is None:
                # 404 from _request: batch endpoint not available on this server.
                raise ConnectionError("batch endpoint not available")
            return [self._work_from_dict(item) for item in result.get("results", [])]
        except (ConnectionError, OSError, ValueError):
            # Network/HTTP errors or a malformed JSON body (ValueError covers
            # json.JSONDecodeError): fall back to individual lookups.
            works = []
            for doi in dois:
                work = self.get(doi)
                if work:
                    works.append(work)
            return works

    def exists(self, doi: str) -> bool:
        """Check if a DOI exists."""
        return self.get(doi) is not None

    def get_citations(self, doi: str, direction: str = "both") -> Dict:
        """
        Get citations for a paper (legacy endpoint).

        Args:
            doi: Paper DOI
            direction: 'citing', 'cited_by', or 'both'

        Returns:
            Dict with citation information
        """
        params = {"doi": doi, "direction": direction}
        return self._request("/api/citations/", params) or {}

    def get_citing(self, doi: str, limit: int = 100) -> List[str]:
        """
        Get DOIs of papers that cite the given DOI.

        Args:
            doi: The DOI to find citations for
            limit: Maximum number of citing papers to return

        Returns:
            List of DOIs that cite this paper
        """
        data = self._request(f"/citations/{doi}/citing", {"limit": limit})
        if not data:
            return []
        return data.get("papers", [])

    def get_cited(self, doi: str, limit: int = 100) -> List[str]:
        """
        Get DOIs of papers that the given DOI cites (references).

        Args:
            doi: The DOI to find references for
            limit: Maximum number of referenced papers to return

        Returns:
            List of DOIs that this paper cites
        """
        data = self._request(f"/citations/{doi}/cited", {"limit": limit})
        if not data:
            return []
        return data.get("papers", [])

    def get_citation_count(self, doi: str) -> int:
        """
        Get the number of citations for a DOI.

        Args:
            doi: The DOI to count citations for

        Returns:
            Number of papers citing this DOI (0 when unknown)
        """
        data = self._request(f"/citations/{doi}/count")
        if not data:
            return 0
        return data.get("citation_count", 0)

    def get_citation_network(
        self, doi: str, depth: int = 1, max_citing: int = 25, max_cited: int = 25
    ) -> Dict:
        """
        Get citation network graph for a DOI.

        Args:
            doi: The DOI to build the network around
            depth: How many levels of citations to include (1-3)
            max_citing: Max papers citing each node to include
            max_cited: Max papers each node cites to include

        Returns:
            Dict with nodes, edges, and stats
        """
        params = {
            "depth": depth,
            "max_citing": max_citing,
            "max_cited": max_cited,
        }
        data = self._request(f"/citations/{doi}/network", params)
        return data or {}

    def get_journal(
        self, issn: Optional[str] = None, name: Optional[str] = None
    ) -> Dict:
        """
        Get journal information.

        Args:
            issn: Journal ISSN
            name: Journal name

        Returns:
            Dict with journal information
        """
        params = {"issn": issn, "name": name}
        return self._request("/api/journal/", params) or {}
|
339
|
+
|
|
340
|
+
|
|
341
|
+
# Module-level client for convenience
_client: Optional[RemoteClient] = None


def get_client(base_url: str = DEFAULT_API_URL) -> RemoteClient:
    """Get or create singleton remote client.

    The cached client is reused unless ``base_url`` targets a different
    server.
    """
    global _client
    # RemoteClient stores base_url with trailing slashes stripped, so compare
    # against the normalized form; otherwise calling with "http://x/" would
    # rebuild a client that already targets "http://x" on every call.
    normalized = base_url.rstrip("/")
    if _client is None or _client.base_url != normalized:
        _client = RemoteClient(base_url)
    return _client


def reset_client() -> None:
    """Reset singleton client (the next get_client() builds a fresh one)."""
    global _client
    _client = None
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Collection methods mixin for RemoteClient."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import urllib.request
|
|
5
|
+
import urllib.parse
|
|
6
|
+
import urllib.error
|
|
7
|
+
from typing import Dict, List, Optional, Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CollectionsMixin:
    """Mixin providing collection management methods for RemoteClient."""

    def list_collections(self) -> List[Dict]:
        """
        List all collections.

        Returns:
            List of collection info dictionaries
        """
        data = self._request("/collections")
        return data.get("collections", []) if data else []

    def create_collection(
        self,
        name: str,
        query: Optional[str] = None,
        dois: Optional[List[str]] = None,
        limit: int = 1000,
    ) -> Dict:
        """
        Create a new collection from search query or DOI list.

        Args:
            name: Collection name
            query: FTS search query (if dois not provided)
            dois: Explicit list of DOIs
            limit: Max papers for query mode

        Returns:
            Collection info dictionary
        """
        payload: Dict[str, Any] = {"name": name, "limit": limit}
        if query:
            payload["query"] = query
        if dois:
            payload["dois"] = dois
        return self._request("/collections", method="POST", data=payload) or {}

    def get_collection(
        self,
        name: str,
        fields: Optional[List[str]] = None,
        include_abstract: bool = False,
        include_references: bool = False,
        include_citations: bool = False,
        year_min: Optional[int] = None,
        year_max: Optional[int] = None,
        journal: Optional[str] = None,
        limit: Optional[int] = None,
    ) -> Dict:
        """
        Query a collection with field filtering.

        Args:
            name: Collection name
            fields: Explicit field list
            include_abstract: Include abstracts
            include_references: Include references
            include_citations: Include citation counts
            year_min: Filter by min year
            year_max: Filter by max year
            journal: Filter by journal
            limit: Max results

        Returns:
            Dict with collection name, count, and papers
        """
        query_params: Dict[str, Any] = {
            "include_abstract": include_abstract,
            "include_references": include_references,
            "include_citations": include_citations,
            "year_min": year_min,
            "year_max": year_max,
            "journal": journal,
            "limit": limit,
        }
        if fields:
            query_params["fields"] = ",".join(fields)
        return self._request(f"/collections/{name}", query_params) or {}

    def get_collection_stats(self, name: str) -> Dict:
        """
        Get collection statistics.

        Args:
            name: Collection name

        Returns:
            Dict with year distribution, top journals, citation stats
        """
        return self._request(f"/collections/{name}/stats") or {}

    def download_collection(
        self,
        name: str,
        output_path: str,
        format: str = "json",
        fields: Optional[List[str]] = None,
    ) -> str:
        """
        Download collection as a file.

        Args:
            name: Collection name
            output_path: Local file path to save to
            format: Export format (json, csv, bibtex, dois)
            fields: Fields to include (json/csv)

        Returns:
            Output file path
        """
        query: Dict[str, str] = {"format": format}
        if fields:
            query["fields"] = ",".join(fields)

        url = f"{self.base_url}/collections/{name}/download"
        if query:
            url = f"{url}?{urllib.parse.urlencode(query)}"

        try:
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=self.timeout) as response:
                payload = response.read()
        except urllib.error.HTTPError as e:
            raise ConnectionError(f"Download failed: {e.code} {e.reason}") from e
        except urllib.error.URLError as e:
            raise ConnectionError(f"Cannot connect: {e.reason}") from e

        with open(output_path, "wb") as f:
            f.write(payload)
        return output_path

    def delete_collection(self, name: str) -> bool:
        """
        Delete a collection.

        Args:
            name: Collection name

        Returns:
            True if deleted
        """
        data = self._request(f"/collections/{name}", method="DELETE")
        return data.get("deleted", False) if data else False

    def collection_exists(self, name: str) -> bool:
        """
        Check if a collection exists.

        Args:
            name: Collection name

        Returns:
            True if exists
        """
        # The stats endpoint 404s (-> None) for unknown collections.
        return self._request(f"/collections/{name}/stats") is not None
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""FastAPI server for CrossRef Local with FTS5 search.
|
|
2
|
+
|
|
3
|
+
Modular server structure:
|
|
4
|
+
- routes_works.py: /works endpoints
|
|
5
|
+
- routes_citations.py: /citations endpoints
|
|
6
|
+
- routes_collections.py: /collections endpoints
|
|
7
|
+
- routes_compat.py: Legacy /api/* endpoints
|
|
8
|
+
- models.py: Pydantic response models
|
|
9
|
+
- middleware.py: Request middleware
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
from fastapi import FastAPI
|
|
15
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
16
|
+
|
|
17
|
+
from .. import __version__
|
|
18
|
+
from .middleware import UserContextMiddleware
|
|
19
|
+
from .routes_works import router as works_router
|
|
20
|
+
from .routes_citations import router as citations_router
|
|
21
|
+
from .routes_collections import router as collections_router
|
|
22
|
+
from .routes_compat import router as compat_router
|
|
23
|
+
|
|
24
|
+
# Build the FastAPI application.
app = FastAPI(
    title="CrossRef Local API",
    description="Fast full-text search across 167M+ scholarly works",
    version=__version__,
)

# Middleware — registration order matters: the last-added middleware wraps
# the others, so keep UserContextMiddleware first and CORS second.
app.add_middleware(UserContextMiddleware)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the per-domain route modules.
for _router in (works_router, citations_router, collections_router, compat_router):
    app.include_router(_router)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@app.get("/")
def root():
    """API root with endpoint information."""
    endpoints = {
        "health": "/health",
        "info": "/info",
        "search": "/works?q=<query>",
        "get_by_doi": "/works/{doi}",
        "batch": "/works/batch",
        "citations_citing": "/citations/{doi}/citing",
        "citations_cited": "/citations/{doi}/cited",
        "citations_count": "/citations/{doi}/count",
        "citations_network": "/citations/{doi}/network",
        "collections_list": "/collections",
        "collections_create": "/collections (POST)",
        "collections_get": "/collections/{name}",
        "collections_stats": "/collections/{name}/stats",
        "collections_download": "/collections/{name}/download",
        "collections_delete": "/collections/{name} (DELETE)",
    }
    return {
        "name": "CrossRef Local API",
        "version": __version__,
        "status": "running",
        "endpoints": endpoints,
    }
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@app.get("/health")
def health():
    """Health check endpoint."""
    # Imported lazily to avoid opening the database at module import time.
    from .._core.db import get_db

    db = get_db()
    connected = db is not None
    return {
        "status": "healthy",
        "database_connected": connected,
        "database_path": str(db.db_path) if connected else None,
    }
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@app.get("/info")
def info():
    """Get database statistics (work, FTS-index, and citation row counts)."""
    from .._core.db import get_db
    from .models import InfoResponse

    db = get_db()

    def _optional_count(table: str) -> int:
        """Row count for a table that may not exist (FTS index, citations)."""
        try:
            row = db.fetchone(f"SELECT COUNT(*) as count FROM {table}")
            return row["count"] if row else 0
        except Exception:
            # Table absent (e.g. FTS not built yet): report 0 rather than 500.
            return 0

    # The works table is mandatory — let a failure here propagate.
    row = db.fetchone("SELECT COUNT(*) as count FROM works")
    work_count = row["count"] if row else 0

    return InfoResponse(
        total_papers=work_count,
        fts_indexed=_optional_count("works_fts"),
        citations=_optional_count("citations"),
        database_path=str(db.db_path),
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Default port: SCITEX convention (3129X scheme).
# SCITEX_SCHOLAR_CROSSREF_* takes precedence over CROSSREF_LOCAL_*.
_port_str = os.environ.get(
    "SCITEX_SCHOLAR_CROSSREF_PORT",
    os.environ.get("CROSSREF_LOCAL_PORT", "31291"),
)
DEFAULT_PORT = int(_port_str)

DEFAULT_HOST = os.environ.get(
    "SCITEX_SCHOLAR_CROSSREF_HOST",
    os.environ.get("CROSSREF_LOCAL_HOST", "0.0.0.0"),
)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def run_server(host: "str | None" = None, port: "int | None" = None):
    """Run the FastAPI server via uvicorn.

    Args:
        host: Bind address; falls back to DEFAULT_HOST when None.
        port: TCP port; falls back to DEFAULT_PORT when None.
    """
    # Imported lazily so importing this module does not require uvicorn.
    import uvicorn

    host = host or DEFAULT_HOST
    port = port or DEFAULT_PORT
    uvicorn.run(app, host=host, port=port)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
__all__ = ["app", "run_server", "DEFAULT_PORT", "DEFAULT_HOST"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Request middleware for CrossRef Local API."""
|
|
2
|
+
|
|
3
|
+
from fastapi import Request
|
|
4
|
+
from starlette.middleware.base import BaseHTTPMiddleware
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class UserContextMiddleware(BaseHTTPMiddleware):
    """Extract X-User-ID header for multi-tenant collection scoping.

    When requests come through scitex-cloud gateway, it passes the
    authenticated user's ID via X-User-ID header. This middleware
    extracts it and makes it available via request.state.user_id.

    Usage in endpoints:
        @app.get("/collections")
        def list_collections(request: Request):
            user_id = request.state.user_id  # None for local, set for cloud
            ...
    """

    async def dispatch(self, request: Request, call_next):
        # Header is set by the scitex-cloud gateway; absent (None) for
        # direct/local requests.
        user_id = request.headers.get("X-User-ID")
        request.state.user_id = user_id
        return await call_next(request)