crossref-local 0.3.1-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossref_local/__init__.py +38 -16
- crossref_local/__main__.py +0 -0
- crossref_local/_aio/__init__.py +30 -0
- crossref_local/_aio/_impl.py +238 -0
- crossref_local/_cache/__init__.py +15 -0
- crossref_local/_cache/export.py +100 -0
- crossref_local/_cache/utils.py +93 -0
- crossref_local/_cache/viz.py +296 -0
- crossref_local/_cli/__init__.py +9 -0
- crossref_local/_cli/cache.py +179 -0
- crossref_local/_cli/cli.py +512 -0
- crossref_local/_cli/completion.py +245 -0
- crossref_local/_cli/main.py +20 -0
- crossref_local/_cli/mcp.py +351 -0
- crossref_local/_cli/mcp_server.py +413 -0
- crossref_local/_core/__init__.py +58 -0
- crossref_local/{api.py → _core/api.py} +130 -36
- crossref_local/{citations.py → _core/citations.py} +55 -26
- crossref_local/{config.py → _core/config.py} +57 -42
- crossref_local/{db.py → _core/db.py} +32 -26
- crossref_local/{fts.py → _core/fts.py} +18 -14
- crossref_local/{models.py → _core/models.py} +11 -6
- crossref_local/{impact_factor → _impact_factor}/__init__.py +0 -0
- crossref_local/{impact_factor → _impact_factor}/calculator.py +0 -0
- crossref_local/{impact_factor → _impact_factor}/journal_lookup.py +0 -0
- crossref_local/_remote/__init__.py +56 -0
- crossref_local/_remote/base.py +356 -0
- crossref_local/_remote/collections.py +175 -0
- crossref_local/_server/__init__.py +140 -0
- crossref_local/_server/middleware.py +25 -0
- crossref_local/_server/models.py +129 -0
- crossref_local/_server/routes_citations.py +98 -0
- crossref_local/_server/routes_collections.py +282 -0
- crossref_local/_server/routes_compat.py +102 -0
- crossref_local/_server/routes_works.py +128 -0
- crossref_local/_server/server.py +19 -0
- crossref_local/aio.py +30 -206
- crossref_local/cache.py +466 -0
- crossref_local/cli.py +5 -447
- crossref_local/jobs.py +169 -0
- crossref_local/mcp_server.py +5 -199
- crossref_local/remote.py +5 -261
- crossref_local/server.py +5 -349
- {crossref_local-0.3.1.dist-info → crossref_local-0.5.0.dist-info}/METADATA +88 -24
- crossref_local-0.5.0.dist-info/RECORD +47 -0
- crossref_local-0.3.1.dist-info/RECORD +0 -20
- {crossref_local-0.3.1.dist-info → crossref_local-0.5.0.dist-info}/WHEEL +0 -0
- {crossref_local-0.3.1.dist-info → crossref_local-0.5.0.dist-info}/entry_points.txt +0 -0
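Read together, the file moves above show the shape of the 0.5.0 restructure: the implementation migrates into private subpackages (`_core`, `_aio`, `_cache`, `_cli`, `_remote`, `_server`, `_impact_factor`), while the original top-level modules shrink to thin re-export shims (e.g. `aio.py` +30 −206, `cli.py` +5 −447). Below is a minimal import sketch of what that layout implies for callers; the public names are taken from the diffs that follow, and the private paths are shown only for orientation — whether they are importable or supported is an assumption, not something this diff guarantees.

# Public entry points keep their 0.3.1 import paths (now re-export shims).
from crossref_local import aio, cache
from crossref_local.aio import search, count_many

# The implementation now lives in private subpackages; importing them directly
# is presumably unsupported (hypothetical paths, for orientation only):
# from crossref_local._core import api
# from crossref_local._aio import search as _search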
crossref_local/aio.py
CHANGED
@@ -1,5 +1,5 @@
-"""
-Async API for crossref_local.
+#!/usr/bin/env python3
+"""Async API module for crossref_local.
 
 Provides async versions of all API functions. Uses thread pool execution
 with per-thread database connections for thread safety.
@@ -19,210 +19,30 @@ Usage:
     counts = await aio.count_many(["CRISPR", "machine learning"])
 """
 
-import
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    db = _get_thread_db()
-    return fts._search_with_db(db, query, limit, offset)
-
-
-def _count_sync(query: str) -> int:
-    """Thread-safe sync count."""
-    from . import fts
-    db = _get_thread_db()
-    return fts._count_with_db(db, query)
-
-
-def _get_sync(doi: str) -> Optional[Work]:
-    """Thread-safe sync get."""
-    db = _get_thread_db()
-    metadata = db.get_metadata(doi)
-    if metadata:
-        return Work.from_metadata(doi, metadata)
-    return None
-
-
-def _get_many_sync(dois: List[str]) -> List[Work]:
-    """Thread-safe sync get_many."""
-    db = _get_thread_db()
-    works = []
-    for doi in dois:
-        metadata = db.get_metadata(doi)
-        if metadata:
-            works.append(Work.from_metadata(doi, metadata))
-    return works
-
-
-def _exists_sync(doi: str) -> bool:
-    """Thread-safe sync exists."""
-    db = _get_thread_db()
-    row = db.fetchone("SELECT 1 FROM works WHERE doi = ?", (doi,))
-    return row is not None
-
-
-def _info_sync() -> dict:
-    """Thread-safe sync info."""
-    db = _get_thread_db()
-
-    row = db.fetchone("SELECT COUNT(*) as count FROM works")
-    work_count = row["count"] if row else 0
-
-    try:
-        row = db.fetchone("SELECT COUNT(*) as count FROM works_fts")
-        fts_count = row["count"] if row else 0
-    except Exception:
-        fts_count = 0
-
-    try:
-        row = db.fetchone("SELECT COUNT(*) as count FROM citations")
-        citation_count = row["count"] if row else 0
-    except Exception:
-        citation_count = 0
-
-    return {
-        "db_path": str(Config.get_db_path()),
-        "works": work_count,
-        "fts_indexed": fts_count,
-        "citations": citation_count,
-    }
-
-
-async def search(
-    query: str,
-    limit: int = 10,
-    offset: int = 0,
-) -> SearchResult:
-    """
-    Async full-text search across works.
-
-    Args:
-        query: Search query (supports FTS5 syntax)
-        limit: Maximum results to return
-        offset: Skip first N results (for pagination)
-
-    Returns:
-        SearchResult with matching works
-    """
-    return await asyncio.to_thread(_search_sync, query, limit, offset)
-
-
-async def count(query: str) -> int:
-    """
-    Async count matching works without fetching results.
-
-    Args:
-        query: FTS5 search query
-
-    Returns:
-        Number of matching works
-    """
-    return await asyncio.to_thread(_count_sync, query)
-
-
-async def get(doi: str) -> Optional[Work]:
-    """
-    Async get a work by DOI.
-
-    Args:
-        doi: Digital Object Identifier
-
-    Returns:
-        Work object or None if not found
-    """
-    return await asyncio.to_thread(_get_sync, doi)
-
-
-async def get_many(dois: List[str]) -> List[Work]:
-    """
-    Async get multiple works by DOI.
-
-    Args:
-        dois: List of DOIs
-
-    Returns:
-        List of Work objects (missing DOIs are skipped)
-    """
-    return await asyncio.to_thread(_get_many_sync, dois)
-
-
-async def exists(doi: str) -> bool:
-    """
-    Async check if a DOI exists in the database.
-
-    Args:
-        doi: Digital Object Identifier
-
-    Returns:
-        True if DOI exists
-    """
-    return await asyncio.to_thread(_exists_sync, doi)
-
-
-async def info() -> dict:
-    """
-    Async get database information.
-
-    Returns:
-        Dictionary with database stats
-    """
-    return await asyncio.to_thread(_info_sync)
-
-
-async def search_many(queries: List[str], limit: int = 10) -> List[SearchResult]:
-    """
-    Run multiple searches concurrently.
-
-    Args:
-        queries: List of search queries
-        limit: Maximum results per query
-
-    Returns:
-        List of SearchResult objects
-    """
-    tasks = [search(q, limit=limit) for q in queries]
-    return await asyncio.gather(*tasks)
-
-
-async def count_many(queries: List[str]) -> dict:
-    """
-    Count matches for multiple queries concurrently.
-
-    Args:
-        queries: List of search queries
-
-    Returns:
-        Dict mapping query -> count
-
-    Example:
-        >>> counts = await count_many(["CRISPR", "machine learning"])
-        >>> print(counts)
-        {'CRISPR': 45000, 'machine learning': 477922}
-    """
-    tasks = [count(q) for q in queries]
-    results = await asyncio.gather(*tasks)
-    return dict(zip(queries, results))
-
+from ._aio import (
+    SearchResult as _SearchResult,
+    Work as _Work,
+    count as _count,
+    count_many as _count_many,
+    exists as _exists,
+    get as _get,
+    get_many as _get_many,
+    info as _info,
+    search as _search,
+    search_many as _search_many,
+)
+
+# Re-export with clean names
+search = _search
+count = _count
+get = _get
+get_many = _get_many
+exists = _exists
+info = _info
+search_many = _search_many
+count_many = _count_many
+SearchResult = _SearchResult
+Work = _Work
 
 __all__ = [
     "search",
@@ -233,4 +53,8 @@ __all__ = [
     "info",
     "search_many",
     "count_many",
+    "SearchResult",
+    "Work",
 ]
+
+# EOF
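The net effect of this change is that `aio` keeps its public surface (now also exporting `SearchResult` and `Work`) while the implementation moves into `_aio`. A minimal usage sketch, assuming the call signatures shown in the removed 0.3.1 code still hold in 0.5.0 (`search(query, limit, offset)`, `count_many(queries)`); the query strings are illustrative:

import asyncio

from crossref_local import aio


async def main() -> None:
    # Concurrent counts for several FTS queries (example from the module docstring).
    counts = await aio.count_many(["CRISPR", "machine learning"])
    print(counts)

    # Paged full-text search; limit/offset defaults per the 0.3.1 signature.
    result = await aio.search("seizure prediction", limit=10, offset=0)
    for work in result.works:
        print(work.doi)


asyncio.run(main())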
crossref_local/cache.py
ADDED
@@ -0,0 +1,466 @@
+"""Cache module for crossref-local.
+
+Provides disk-based caching of paper metadata to reduce context usage
+and enable efficient re-querying with field filtering.
+
+Architecture:
+    1. FTS search -> DOIs (fast, minimal)
+    2. Cache DOIs -> full metadata saved to disk
+    3. Query cache -> filtered fields based on need
+
+Usage:
+    >>> from crossref_local import cache
+    >>> # Create cache from search
+    >>> cache.create("epilepsy", query="epilepsy seizure prediction", limit=100)
+    >>> # Query with minimal fields
+    >>> papers = cache.query("epilepsy", fields=["doi", "title", "year"])
+    >>> # Get statistics
+    >>> stats = cache.stats("epilepsy")
+"""
+
+import json as _json
+import time as _time
+from dataclasses import dataclass as _dataclass
+from typing import Any as _Any
+from typing import Dict as _Dict
+from typing import List as _List
+from typing import Optional as _Optional
+
+from ._core.api import get_many as _get_many
+from ._core.api import search as _search
+from ._cache.utils import cache_path as _cache_path
+from ._cache.utils import get_cache_dir as _get_cache_dir
+from ._cache.utils import meta_path as _meta_path
+
+__all__ = [
+    "CacheInfo",
+    "create",
+    "append",
+    "load",
+    "query",
+    "query_dois",
+    "stats",
+    "info",
+    "exists",
+    "list_caches",
+    "delete",
+    "export",
+]
+
+
+@_dataclass
+class CacheInfo:
+    """Information about a cache."""
+
+    name: str
+    path: str
+    size_bytes: int
+    paper_count: int
+    created_at: str
+    query: _Optional[str] = None
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "path": self.path,
+            "size_bytes": self.size_bytes,
+            "size_mb": round(self.size_bytes / 1024 / 1024, 2),
+            "paper_count": self.paper_count,
+            "created_at": self.created_at,
+            "query": self.query,
+        }
+
+
+def create(
+    name: str,
+    query: _Optional[str] = None,
+    dois: _Optional[_List[str]] = None,
+    papers: _Optional[_List[_Dict[str, _Any]]] = None,
+    limit: int = 1000,
+    offset: int = 0,
+    user_id: _Optional[str] = None,
+) -> CacheInfo:
+    """Create a cache from search query, DOI list, or pre-fetched papers.
+
+    Args:
+        name: Cache name (used as filename)
+        query: FTS search query (if dois/papers not provided)
+        dois: Explicit list of DOIs to cache
+        papers: Pre-fetched paper dicts (skips API calls)
+        limit: Max papers to fetch (for query mode)
+        offset: Offset for pagination (for query mode)
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        CacheInfo with cache details
+
+    Example:
+        >>> create("epilepsy", query="epilepsy seizure", limit=500)
+        >>> create("my_papers", dois=["10.1038/nature12373", ...])
+        >>> create("imported", papers=[{"doi": "...", "title": "..."}])
+    """
+    if papers is not None:
+        # Use pre-fetched papers directly
+        pass
+    elif dois is None and query is None:
+        raise ValueError("Must provide 'query', 'dois', or 'papers'")
+    elif dois is None:
+        # Get DOIs from search
+        results = _search(query, limit=limit, offset=offset)
+        dois = [w.doi for w in results.works]
+        # Fetch full metadata
+        works = _get_many(dois)
+        papers = [w.to_dict() for w in works]
+    else:
+        # Fetch full metadata for DOIs
+        works = _get_many(dois)
+        papers = [w.to_dict() for w in works]
+
+    # Save cache
+    cache_file = _cache_path(name, user_id)
+    with open(cache_file, "w") as f:
+        _json.dump(papers, f)
+
+    # Save metadata
+    meta = {
+        "name": name,
+        "query": query,
+        "created_at": _time.strftime("%Y-%m-%d %H:%M:%S"),
+        "paper_count": len(papers),
+        "dois_requested": len(dois) if dois else len(papers),
+    }
+    with open(_meta_path(name, user_id), "w") as f:
+        _json.dump(meta, f, indent=2)
+
+    return CacheInfo(
+        name=name,
+        path=str(cache_file),
+        size_bytes=cache_file.stat().st_size,
+        paper_count=len(papers),
+        created_at=meta["created_at"],
+        query=query,
+    )
+
+
+def append(
+    name: str,
+    query: _Optional[str] = None,
+    dois: _Optional[_List[str]] = None,
+    limit: int = 1000,
+    offset: int = 0,
+    user_id: _Optional[str] = None,
+) -> CacheInfo:
+    """Append papers to existing cache.
+
+    Args:
+        name: Existing cache name
+        query: FTS search query (if dois not provided)
+        dois: Explicit list of DOIs to add
+        limit: Max papers to fetch (for query mode)
+        offset: Offset for pagination (for query mode)
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        Updated CacheInfo
+    """
+    if not exists(name, user_id=user_id):
+        return create(
+            name, query=query, dois=dois, limit=limit, offset=offset, user_id=user_id
+        )
+
+    # Load existing
+    existing = load(name, user_id=user_id)
+    existing_dois = {p["doi"] for p in existing}
+
+    # Get new DOIs
+    if dois is None and query is not None:
+        results = _search(query, limit=limit, offset=offset)
+        dois = [w.doi for w in results.works]
+    elif dois is None:
+        raise ValueError("Must provide either 'query' or 'dois'")
+
+    # Filter out already cached
+    new_dois = [d for d in dois if d not in existing_dois]
+
+    if new_dois:
+        # Fetch new metadata
+        new_works = _get_many(new_dois)
+        new_papers = [w.to_dict() for w in new_works]
+
+        # Combine and save
+        all_papers = existing + new_papers
+        cache_file = _cache_path(name, user_id)
+        with open(cache_file, "w") as f:
+            _json.dump(all_papers, f)
+
+        # Update metadata
+        meta_file = _meta_path(name, user_id)
+        if meta_file.exists():
+            with open(meta_file) as f:
+                meta = _json.load(f)
+        else:
+            meta = {"name": name}
+
+        meta["updated_at"] = _time.strftime("%Y-%m-%d %H:%M:%S")
+        meta["paper_count"] = len(all_papers)
+
+        with open(meta_file, "w") as f:
+            _json.dump(meta, f, indent=2)
+
+        return info(name, user_id=user_id)
+
+    return info(name, user_id=user_id)
+
+
+def load(name: str, user_id: _Optional[str] = None) -> _List[_Dict[str, _Any]]:
+    """Load raw cache data.
+
+    Args:
+        name: Cache name
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        List of paper dictionaries with full metadata
+    """
+    cache_file = _cache_path(name, user_id)
+    if not cache_file.exists():
+        raise FileNotFoundError(f"Cache not found: {name}")
+
+    with open(cache_file) as f:
+        return _json.load(f)
+
+
+def query(
+    name: str,
+    fields: _Optional[_List[str]] = None,
+    include_abstract: bool = False,
+    include_references: bool = False,
+    include_citations: bool = False,
+    year_min: _Optional[int] = None,
+    year_max: _Optional[int] = None,
+    journal: _Optional[str] = None,
+    limit: _Optional[int] = None,
+    user_id: _Optional[str] = None,
+) -> _List[_Dict[str, _Any]]:
+    """Query cache with field filtering.
+
+    Args:
+        name: Cache name
+        fields: Explicit field list (overrides include_* flags)
+        include_abstract: Include abstract field
+        include_references: Include references list
+        include_citations: Include citation_count
+        year_min: Filter by minimum year
+        year_max: Filter by maximum year
+        journal: Filter by journal name (substring match)
+        limit: Max results to return
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        Filtered list of paper dictionaries
+
+    Example:
+        >>> # Minimal query
+        >>> papers = query("epilepsy", fields=["doi", "title", "year"])
+        >>> # With filters
+        >>> papers = query("epilepsy", year_min=2020, include_citations=True)
+    """
+    papers = load(name, user_id=user_id)
+
+    # Apply filters
+    if year_min is not None:
+        papers = [p for p in papers if p.get("year") and p["year"] >= year_min]
+    if year_max is not None:
+        papers = [p for p in papers if p.get("year") and p["year"] <= year_max]
+    if journal is not None:
+        journal_lower = journal.lower()
+        papers = [
+            p
+            for p in papers
+            if p.get("journal") and journal_lower in p["journal"].lower()
+        ]
+
+    # Apply limit
+    if limit is not None:
+        papers = papers[:limit]
+
+    # Field projection
+    if fields is not None:
+        # Explicit field list
+        papers = [{k: p.get(k) for k in fields if k in p} for p in papers]
+    else:
+        # Build field list from flags
+        base_fields = {"doi", "title", "authors", "year", "journal"}
+        if include_abstract:
+            base_fields.add("abstract")
+        if include_references:
+            base_fields.add("references")
+        if include_citations:
+            base_fields.add("citation_count")
+
+        papers = [{k: p.get(k) for k in base_fields if k in p} for p in papers]
+
+    return papers
+
+
+def query_dois(name: str, user_id: _Optional[str] = None) -> _List[str]:
+    """Get just DOIs from cache.
+
+    Args:
+        name: Cache name
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        List of DOIs
+    """
+    papers = load(name, user_id=user_id)
+    return [p["doi"] for p in papers if p.get("doi")]
+
+
+def stats(name: str, user_id: _Optional[str] = None) -> _Dict[str, _Any]:
+    """Get cache statistics.
+
+    Args:
+        name: Cache name
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        Dictionary with statistics
+    """
+    papers = load(name, user_id=user_id)
+
+    # Year distribution
+    years = [p.get("year") for p in papers if p.get("year")]
+    year_dist = {}
+    for y in years:
+        year_dist[y] = year_dist.get(y, 0) + 1
+
+    # Journal distribution
+    journals = [p.get("journal") for p in papers if p.get("journal")]
+    journal_dist = {}
+    for j in journals:
+        journal_dist[j] = journal_dist.get(j, 0) + 1
+    top_journals = sorted(journal_dist.items(), key=lambda x: -x[1])[:20]
+
+    # Abstract coverage
+    with_abstract = sum(1 for p in papers if p.get("abstract"))
+
+    # Citation stats
+    citations = [p.get("citation_count", 0) for p in papers if p.get("citation_count")]
+
+    return {
+        "paper_count": len(papers),
+        "year_range": {
+            "min": min(years) if years else None,
+            "max": max(years) if years else None,
+        },
+        "year_distribution": dict(sorted(year_dist.items())),
+        "with_abstract": with_abstract,
+        "abstract_coverage": round(with_abstract / len(papers) * 100, 1)
+        if papers
+        else 0,
+        "top_journals": [{"journal": j, "count": c} for j, c in top_journals],
+        "citation_stats": {
+            "total": sum(citations),
+            "mean": round(sum(citations) / len(citations), 1) if citations else 0,
+            "max": max(citations) if citations else 0,
+        }
+        if citations
+        else None,
+    }
+
+
+def info(name: str, user_id: _Optional[str] = None) -> CacheInfo:
+    """Get cache information.
+
+    Args:
+        name: Cache name
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        CacheInfo object
+    """
+    cache_file = _cache_path(name, user_id)
+    if not cache_file.exists():
+        raise FileNotFoundError(f"Cache not found: {name}")
+
+    meta_file = _meta_path(name, user_id)
+    meta = {}
+    if meta_file.exists():
+        with open(meta_file) as f:
+            meta = _json.load(f)
+
+    papers = load(name, user_id=user_id)
+
+    return CacheInfo(
+        name=name,
+        path=str(cache_file),
+        size_bytes=cache_file.stat().st_size,
+        paper_count=len(papers),
+        created_at=meta.get("created_at", "unknown"),
+        query=meta.get("query"),
+    )
+
+
+def exists(name: str, user_id: _Optional[str] = None) -> bool:
+    """Check if cache exists.
+
+    Args:
+        name: Cache name
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        True if cache exists
+    """
+    return _cache_path(name, user_id).exists()
+
+
+def list_caches(user_id: _Optional[str] = None) -> _List[CacheInfo]:
+    """List all available caches.
+
+    Args:
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        List of CacheInfo objects
+    """
+    cache_dir = _get_cache_dir(user_id)
+    caches = []
+
+    for f in cache_dir.glob("*.json"):
+        if f.name.endswith(".meta.json"):
+            continue
+        name = f.stem
+        try:
+            caches.append(info(name, user_id=user_id))
+        except Exception:
+            pass
+
+    return sorted(caches, key=lambda c: c.name)
+
+
+def delete(name: str, user_id: _Optional[str] = None) -> bool:
+    """Delete a cache.
+
+    Args:
+        name: Cache name
+        user_id: _Optional user ID for multi-tenant scoping
+
+    Returns:
+        True if deleted
+    """
+    cache_file = _cache_path(name, user_id)
+    meta_file = _meta_path(name, user_id)
+
+    deleted = False
+    if cache_file.exists():
+        cache_file.unlink()
+        deleted = True
+    if meta_file.exists():
+        meta_file.unlink()
+
+    return deleted
+
+
+# Re-export from cache_export for backwards compatibility
+from ._cache.export import export