openalex-local 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openalex_local/__init__.py +28 -7
- openalex_local/_cache/__init__.py +45 -0
- openalex_local/_cache/core.py +298 -0
- openalex_local/_cache/export.py +100 -0
- openalex_local/_cache/models.py +17 -0
- openalex_local/_cache/utils.py +85 -0
- openalex_local/_cli/__init__.py +9 -0
- openalex_local/_cli/cli.py +409 -0
- openalex_local/_cli/cli_cache.py +220 -0
- openalex_local/_cli/mcp.py +210 -0
- openalex_local/_cli/mcp_server.py +235 -0
- openalex_local/_core/__init__.py +42 -0
- openalex_local/{api.py → _core/api.py} +137 -19
- openalex_local/_core/config.py +120 -0
- openalex_local/{db.py → _core/db.py} +53 -0
- openalex_local/_core/export.py +252 -0
- openalex_local/{models.py → _core/models.py} +201 -0
- openalex_local/_remote/__init__.py +34 -0
- openalex_local/_remote/base.py +256 -0
- openalex_local/_server/__init__.py +117 -0
- openalex_local/_server/routes.py +175 -0
- openalex_local/aio.py +259 -0
- openalex_local/cache.py +31 -0
- openalex_local/cli.py +4 -205
- openalex_local/jobs.py +169 -0
- openalex_local/remote.py +8 -0
- openalex_local/server.py +8 -0
- openalex_local-0.3.1.dist-info/METADATA +288 -0
- openalex_local-0.3.1.dist-info/RECORD +34 -0
- openalex_local-0.3.1.dist-info/entry_points.txt +2 -0
- openalex_local/config.py +0 -182
- openalex_local-0.3.0.dist-info/METADATA +0 -152
- openalex_local-0.3.0.dist-info/RECORD +0 -13
- openalex_local-0.3.0.dist-info/entry_points.txt +0 -2
- /openalex_local/{fts.py → _core/fts.py} +0 -0
- {openalex_local-0.3.0.dist-info → openalex_local-0.3.1.dist-info}/WHEEL +0 -0
- {openalex_local-0.3.0.dist-info → openalex_local-0.3.1.dist-info}/top_level.txt +0 -0
openalex_local/__init__.py CHANGED

@@ -8,23 +8,34 @@ Example:
     >>> work = get("10.1038/nature12373")  # or DOI
 """
 
-__version__ = "0.3.
+__version__ = "0.3.1"
 
-from .
-
+from ._core import (
+    SUPPORTED_FORMATS,
     SearchResult,
     Work,
     configure,
-    configure_http,
     count,
+    enrich,
+    enrich_ids,
     exists,
     get,
     get_many,
     get_mode,
     info,
+    save,
     search,
 )
 
+# Jobs module (public functions only)
+from . import jobs
+
+# Async module
+from . import aio
+
+# Cache module
+from . import cache
+
 __all__ = [
     # Core functions
     "search",
@@ -33,12 +44,22 @@ __all__ = [
     "get_many",
     "exists",
     "info",
+    # Enrich functions
+    "enrich",
+    "enrich_ids",
     # Configuration
     "configure",
-    "configure_http",
     "get_mode",
-    #
+    # Models
     "Work",
     "SearchResult",
-
+    # Export
+    "save",
+    "SUPPORTED_FORMATS",
+    # Jobs
+    "jobs",
+    # Async
+    "aio",
+    # Cache
+    "cache",
 ]
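For orientation, a minimal sketch of the widened 0.3.1 top-level surface. It uses only names visible in the hunk above and assumes the package is installed with a working local OpenAlex database behind it.

    import openalex_local

    print(openalex_local.__version__)  # "0.3.1"

    # DOI lookup, as shown in the module docstring above
    work = openalex_local.get("10.1038/nature12373")

    # The export list now also carries the new helpers and submodules
    print([n for n in openalex_local.__all__
           if n in ("enrich", "enrich_ids", "save", "jobs", "aio", "cache")])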
openalex_local/_cache/__init__.py ADDED

@@ -0,0 +1,45 @@
+"""Cache module for openalex_local.
+
+Provides local caching of search results and works for offline analysis.
+
+Example:
+    >>> from openalex_local import cache
+    >>> # Create a cache from search
+    >>> info = cache.create("ml_papers", query="machine learning", limit=1000)
+    >>> print(f"Cached {info.count} papers")
+    >>>
+    >>> # Query the cache
+    >>> papers = cache.query("ml_papers", year_min=2020)
+    >>> # Get IDs for further processing
+    >>> ids = cache.query_ids("ml_papers")
+"""
+
+from .models import CacheInfo
+from .core import (
+    create,
+    append,
+    load,
+    query,
+    query_ids,
+    stats,
+    info,
+    exists,
+    list_caches,
+    delete,
+)
+from .export import export
+
+__all__ = [
+    "CacheInfo",
+    "create",
+    "append",
+    "load",
+    "query",
+    "query_ids",
+    "stats",
+    "info",
+    "exists",
+    "list_caches",
+    "delete",
+    "export",
+]
openalex_local/_cache/core.py ADDED

@@ -0,0 +1,298 @@
+"""Core cache operations."""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from .models import CacheInfo
+from .utils import (
+    ensure_cache_dir,
+    get_cache_dir,
+    get_cache_path,
+    validate_cache_name,
+)
+
+
+def _load_cache_raw(name: str) -> Dict[str, Any]:
+    """Load raw cache data."""
+    path = get_cache_path(name)
+    if not path.exists():
+        raise FileNotFoundError(f"Cache not found: {name}")
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def _save_cache_raw(name: str, data: Dict[str, Any]) -> Path:
+    """Save raw cache data."""
+    ensure_cache_dir()
+    path = get_cache_path(name)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+    return path
+
+
+def create(
+    name: str,
+    query: Optional[str] = None,
+    ids: Optional[List[str]] = None,
+    papers: Optional[List[Dict]] = None,
+    limit: int = 1000,
+) -> CacheInfo:
+    """
+    Create a new cache.
+
+    Args:
+        name: Cache name (will be sanitized for filesystem)
+        query: Search query to populate cache
+        ids: List of OpenAlex IDs or DOIs to cache
+        papers: Pre-fetched paper dictionaries to cache
+        limit: Maximum papers to cache from query
+
+    Returns:
+        CacheInfo with cache details
+    """
+    from .. import search, get_many
+
+    error = validate_cache_name(name)
+    if error:
+        raise ValueError(error)
+
+    works_data = []
+    queries = []
+
+    if query:
+        results = search(query, limit=limit)
+        works_data.extend([w.to_dict() for w in results.works])
+        queries.append(query)
+
+    if ids:
+        works = get_many(ids)
+        works_data.extend([w.to_dict() for w in works])
+
+    if papers:
+        works_data.extend(papers)
+
+    # Remove duplicates by openalex_id
+    seen = set()
+    unique_works = []
+    for w in works_data:
+        oid = w.get("openalex_id")
+        if oid and oid not in seen:
+            seen.add(oid)
+            unique_works.append(w)
+
+    now = datetime.utcnow().isoformat()
+    cache_data = {
+        "name": name,
+        "created_at": now,
+        "updated_at": now,
+        "queries": queries,
+        "works": unique_works,
+    }
+
+    path = _save_cache_raw(name, cache_data)
+
+    return CacheInfo(
+        name=name,
+        path=str(path),
+        count=len(unique_works),
+        created_at=now,
+        updated_at=now,
+        queries=queries,
+        size_bytes=path.stat().st_size,
+    )
+
+
+def append(
+    name: str,
+    query: Optional[str] = None,
+    ids: Optional[List[str]] = None,
+    limit: int = 1000,
+) -> CacheInfo:
+    """Append works to an existing cache."""
+    from .. import search, get_many
+
+    cache_data = _load_cache_raw(name)
+    existing_ids = {w.get("openalex_id") for w in cache_data.get("works", [])}
+
+    new_works = []
+    queries = cache_data.get("queries", [])
+
+    if query:
+        results = search(query, limit=limit)
+        for w in results.works:
+            if w.openalex_id not in existing_ids:
+                new_works.append(w.to_dict())
+                existing_ids.add(w.openalex_id)
+        if query not in queries:
+            queries.append(query)
+
+    if ids:
+        works = get_many(ids)
+        for w in works:
+            if w.openalex_id not in existing_ids:
+                new_works.append(w.to_dict())
+                existing_ids.add(w.openalex_id)
+
+    cache_data["works"].extend(new_works)
+    cache_data["queries"] = queries
+    cache_data["updated_at"] = datetime.utcnow().isoformat()
+
+    path = _save_cache_raw(name, cache_data)
+
+    return CacheInfo(
+        name=name,
+        path=str(path),
+        count=len(cache_data["works"]),
+        created_at=cache_data.get("created_at", ""),
+        updated_at=cache_data["updated_at"],
+        queries=queries,
+        size_bytes=path.stat().st_size,
+    )
+
+
+def load(name: str) -> List[Dict]:
+    """Load all works from a cache."""
+    cache_data = _load_cache_raw(name)
+    return cache_data.get("works", [])
+
+
+def query(
+    name: str,
+    fields: Optional[List[str]] = None,
+    year_min: Optional[int] = None,
+    year_max: Optional[int] = None,
+    cited_min: Optional[int] = None,
+    has_abstract: Optional[bool] = None,
+    is_oa: Optional[bool] = None,
+    source: Optional[str] = None,
+    limit: Optional[int] = None,
+) -> List[Dict]:
+    """Query a cache with filters."""
+    works = load(name)
+    results = []
+
+    for w in works:
+        if year_min and (w.get("year") or 0) < year_min:
+            continue
+        if year_max and (w.get("year") or 9999) > year_max:
+            continue
+        if cited_min and (w.get("cited_by_count") or 0) < cited_min:
+            continue
+        if has_abstract is not None:
+            has_abs = bool(w.get("abstract"))
+            if has_abstract != has_abs:
+                continue
+        if is_oa is not None and w.get("is_oa") != is_oa:
+            continue
+        if source and source.lower() not in (w.get("source") or "").lower():
+            continue
+
+        if fields:
+            w = {k: w.get(k) for k in fields}
+
+        results.append(w)
+
+        if limit and len(results) >= limit:
+            break
+
+    return results
+
+
+def query_ids(name: str) -> List[str]:
+    """Get all OpenAlex IDs from a cache."""
+    works = load(name)
+    return [w.get("openalex_id") for w in works if w.get("openalex_id")]
+
+
+def stats(name: str) -> Dict[str, Any]:
+    """Get statistics for a cache."""
+    cache_data = _load_cache_raw(name)
+    works = cache_data.get("works", [])
+
+    if not works:
+        return {
+            "name": name, "total": 0, "year_min": None, "year_max": None,
+            "citations_total": 0, "citations_mean": 0,
+            "with_abstract": 0, "open_access": 0, "sources": [],
+        }
+
+    years = [w.get("year") for w in works if w.get("year")]
+    citations = [w.get("cited_by_count") or 0 for w in works]
+    abstracts = sum(1 for w in works if w.get("abstract"))
+    oa_count = sum(1 for w in works if w.get("is_oa"))
+
+    source_counts: Dict[str, int] = {}
+    for w in works:
+        src = w.get("source")
+        if src:
+            source_counts[src] = source_counts.get(src, 0) + 1
+    top_sources = sorted(source_counts.items(), key=lambda x: -x[1])[:10]
+
+    return {
+        "name": name,
+        "total": len(works),
+        "year_min": min(years) if years else None,
+        "year_max": max(years) if years else None,
+        "citations_total": sum(citations),
+        "citations_mean": sum(citations) / len(works) if works else 0,
+        "with_abstract": abstracts,
+        "with_abstract_pct": round(100 * abstracts / len(works), 1) if works else 0,
+        "open_access": oa_count,
+        "open_access_pct": round(100 * oa_count / len(works), 1) if works else 0,
+        "sources": top_sources,
+        "queries": cache_data.get("queries", []),
+        "created_at": cache_data.get("created_at"),
+        "updated_at": cache_data.get("updated_at"),
+    }
+
+
+def info(name: str) -> CacheInfo:
+    """Get cache info."""
+    path = get_cache_path(name)
+    if not path.exists():
+        raise FileNotFoundError(f"Cache not found: {name}")
+
+    cache_data = _load_cache_raw(name)
+
+    return CacheInfo(
+        name=name,
+        path=str(path),
+        count=len(cache_data.get("works", [])),
+        created_at=cache_data.get("created_at", ""),
+        updated_at=cache_data.get("updated_at", ""),
+        queries=cache_data.get("queries", []),
+        size_bytes=path.stat().st_size,
+    )
+
+
+def exists(name: str) -> bool:
+    """Check if a cache exists."""
+    return get_cache_path(name).exists()
+
+
+def list_caches() -> List[CacheInfo]:
+    """List all caches."""
+    cache_dir = get_cache_dir()
+    if not cache_dir.exists():
+        return []
+
+    caches = []
+    for path in cache_dir.glob("*.json"):
+        try:
+            cache_info = info(path.stem)
+            caches.append(cache_info)
+        except (json.JSONDecodeError, KeyError):
+            continue
+
+    return sorted(caches, key=lambda c: c.updated_at, reverse=True)
+
+
+def delete(name: str) -> bool:
+    """Delete a cache."""
+    path = get_cache_path(name)
+    if path.exists():
+        path.unlink()
+        return True
+    return False
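A compact sketch of the cache lifecycle these functions implement, using only the signatures shown in this hunk. The cache name and the OpenAlex ID are illustrative, and create()/append() assume that search() and get_many() can reach a working local index.

    from openalex_local import cache

    # Build a cache from a query, then grow it with explicit IDs
    # (entries are deduplicated by openalex_id).
    info = cache.create("ml_papers", query="machine learning", limit=500)
    info = cache.append("ml_papers", ids=["W2100837269"])  # illustrative OpenAlex ID

    # Everything after that is served from the cached JSON file.
    print(cache.stats("ml_papers")["total"])
    recent_oa = cache.query("ml_papers", year_min=2020, is_oa=True, fields=["doi", "title"])
    ids = cache.query_ids("ml_papers")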
openalex_local/_cache/export.py ADDED

@@ -0,0 +1,100 @@
+"""Cache export functionality."""
+
+import csv
+import json
+from pathlib import Path
+from typing import List, Dict
+
+from .core import load
+
+
+def export(
+    name: str,
+    output_path: str,
+    format: str = "json",
+) -> str:
+    """
+    Export a cache to a file.
+
+    Args:
+        name: Cache name
+        output_path: Output file path
+        format: Export format ("json", "csv", "bibtex")
+
+    Returns:
+        Path to exported file
+    """
+    works = load(name)
+    output = Path(output_path)
+
+    if format == "json":
+        _export_json(works, output)
+    elif format == "csv":
+        _export_csv(works, output)
+    elif format == "bibtex":
+        _export_bibtex(works, output)
+    else:
+        raise ValueError(f"Unknown format: {format}. Use 'json', 'csv', or 'bibtex'")
+
+    return str(output)
+
+
+def _export_json(works: List[Dict], output: Path) -> None:
+    """Export to JSON format."""
+    with open(output, "w", encoding="utf-8") as f:
+        json.dump(works, f, ensure_ascii=False, indent=2)
+
+
+def _export_csv(works: List[Dict], output: Path) -> None:
+    """Export to CSV format."""
+    if not works:
+        output.write_text("")
+        return
+
+    # Get all unique keys
+    keys = set()
+    for w in works:
+        keys.update(w.keys())
+
+    # Prioritize common fields
+    priority = ["openalex_id", "doi", "title", "authors", "year", "source", "cited_by_count"]
+    fieldnames = [k for k in priority if k in keys]
+    fieldnames.extend(sorted(k for k in keys if k not in priority))
+
+    with open(output, "w", encoding="utf-8", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+        writer.writeheader()
+        for w in works:
+            row = {}
+            for k, v in w.items():
+                if isinstance(v, list):
+                    row[k] = "; ".join(str(x) for x in v)
+                else:
+                    row[k] = v
+            writer.writerow(row)
+
+
+def _export_bibtex(works: List[Dict], output: Path) -> None:
+    """Export to BibTeX format."""
+    from .._core.models import Work
+
+    lines = []
+    for w in works:
+        work = Work(
+            openalex_id=w.get("openalex_id", ""),
+            doi=w.get("doi"),
+            title=w.get("title"),
+            authors=w.get("authors", []),
+            year=w.get("year"),
+            source=w.get("source"),
+            volume=w.get("volume"),
+            issue=w.get("issue"),
+            pages=w.get("pages"),
+            publisher=w.get("publisher"),
+            type=w.get("type"),
+            oa_url=w.get("oa_url"),
+        )
+        lines.append(work.citation("bibtex"))
+        lines.append("")
+
+    output.write_text("\n".join(lines), encoding="utf-8")
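The three export paths above can be exercised as follows; the output paths are illustrative and the "ml_papers" cache is assumed to exist.

    from openalex_local import cache

    cache.export("ml_papers", "ml_papers.json")                  # default format="json"
    cache.export("ml_papers", "ml_papers.csv", format="csv")     # list fields joined with "; "
    cache.export("ml_papers", "ml_papers.bib", format="bibtex")  # entries via Work.citation("bibtex")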
openalex_local/_cache/models.py ADDED

@@ -0,0 +1,17 @@
+"""Cache data models."""
+
+from dataclasses import dataclass, field
+from typing import List
+
+
+@dataclass
+class CacheInfo:
+    """Information about a cache."""
+
+    name: str
+    path: str
+    count: int
+    created_at: str
+    updated_at: str
+    queries: List[str] = field(default_factory=list)
+    size_bytes: int = 0
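Since CacheInfo is a plain dataclass, it can also be constructed directly, for example in tests; the import path mirrors the module shown above and the values are made up for illustration.

    from openalex_local._cache.models import CacheInfo

    info = CacheInfo(
        name="demo",
        path="/tmp/demo.json",
        count=0,
        created_at="2024-01-01T00:00:00",
        updated_at="2024-01-01T00:00:00",
    )
    print(info.queries, info.size_bytes)  # defaulted fields: [] and 0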
openalex_local/_cache/utils.py ADDED

@@ -0,0 +1,85 @@
+"""Cache utilities for openalex_local."""
+
+import os
+import re
+from pathlib import Path
+from typing import Optional
+
+# Default cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".openalex_local" / "caches"
+
+
+def get_cache_dir() -> Path:
+    """Get cache directory from environment or default."""
+    env_dir = os.environ.get("OPENALEX_LOCAL_CACHE_DIR")
+    if env_dir:
+        return Path(env_dir)
+    return DEFAULT_CACHE_DIR
+
+
+def sanitize_cache_name(name: str) -> str:
+    """
+    Sanitize cache name for filesystem safety.
+
+    Args:
+        name: Raw cache name
+
+    Returns:
+        Sanitized cache name
+
+    Example:
+        >>> sanitize_cache_name("my cache/name!")
+        'my_cache_name_'
+    """
+    # Replace non-alphanumeric characters (except - and _) with underscore
+    sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
+    # Remove leading/trailing underscores
+    sanitized = sanitized.strip("_")
+    # Limit length
+    if len(sanitized) > 100:
+        sanitized = sanitized[:100]
+    # Ensure not empty
+    if not sanitized:
+        sanitized = "cache"
+    return sanitized
+
+
+def get_cache_path(name: str) -> Path:
+    """
+    Get full path to cache file.
+
+    Args:
+        name: Cache name
+
+    Returns:
+        Path to cache JSON file
+    """
+    cache_dir = get_cache_dir()
+    safe_name = sanitize_cache_name(name)
+    return cache_dir / f"{safe_name}.json"
+
+
+def ensure_cache_dir() -> Path:
+    """Ensure cache directory exists."""
+    cache_dir = get_cache_dir()
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+def validate_cache_name(name: str) -> Optional[str]:
+    """
+    Validate cache name and return error message if invalid.
+
+    Args:
+        name: Cache name to validate
+
+    Returns:
+        Error message if invalid, None if valid
+    """
+    if not name:
+        return "Cache name cannot be empty"
+    if len(name) > 100:
+        return "Cache name too long (max 100 characters)"
+    if name.startswith("."):
+        return "Cache name cannot start with '.'"
+    return None
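A short sketch of how these helpers resolve names and paths. The environment variable value and directory are illustrative, and the private import path is used only because it is the module shown in this hunk.

    import os
    from openalex_local._cache import utils

    # Redirect the cache directory via the environment variable checked above
    os.environ["OPENALEX_LOCAL_CACHE_DIR"] = "/tmp/oa_caches"

    print(utils.get_cache_dir())                        # /tmp/oa_caches
    print(utils.sanitize_cache_name("my cache/name!"))  # unsafe characters become underscores
    print(utils.get_cache_path("my cache/name!"))       # /tmp/oa_caches/<sanitized name>.json
    print(utils.validate_cache_name(".hidden"))         # "Cache name cannot start with '.'"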