citations-collector 0.2.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ """PDF acquisition and management."""
+
+ from __future__ import annotations
+
+ import logging
+ import subprocess
+ import time
+ from collections.abc import Mapping
+ from pathlib import Path
+
+ import requests
+ from requests.adapters import HTTPAdapter
+ from requests.models import PreparedRequest, Response
+ from urllib3.util.retry import Retry
+
+ from citations_collector.models import CitationRecord
+ from citations_collector.unpaywall import UnpaywallClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class RetryAfterAdapter(HTTPAdapter):
+     """HTTPAdapter that respects Retry-After header from server."""
+
+     def send(
+         self,
+         request: PreparedRequest,
+         stream: bool = False,
+         timeout: float | tuple[float, float] | tuple[float, None] | None = None,
+         verify: bool | str = True,
+         cert: bytes | str | tuple[bytes | str, bytes | str] | None = None,
+         proxies: Mapping[str, str] | None = None,
+     ) -> Response:
+         """Send request with Retry-After header support."""
+         response = super().send(
+             request,
+             stream=stream,
+             timeout=timeout,
+             verify=verify,
+             cert=cert,
+             proxies=proxies,
+         )
+
+         # Check for Retry-After header on 429/503 responses
+         if response.status_code in (429, 503):
+             retry_after = response.headers.get("Retry-After")
+             if retry_after:
+                 try:
+                     # Retry-After can be seconds (int) or HTTP date
+                     delay = int(retry_after)
+                     logger.warning(
+                         f"Rate limited by {request.url}, waiting {delay}s (Retry-After header)"
+                     )
+                     time.sleep(delay)
+                 except ValueError:
+                     # HTTP date format - default to 60s
+                     logger.warning(f"Rate limited by {request.url}, waiting 60s")
+                     time.sleep(60)
+
+         return response
+
+
+ class PDFAcquirer:
+     """Acquire open-access PDFs for citations via Unpaywall, with optional git-annex registration."""
+
+     def __init__(
+         self,
+         output_dir: Path = Path("pdfs"),
+         email: str = "site-unpaywall@oneukrainian.com",
+         git_annex: bool = False,
+     ) -> None:
+         self.output_dir = Path(output_dir)
+         self.unpaywall = UnpaywallClient(email=email)
+         self.git_annex = git_annex
+
+         # Create session with retry logic and proper User-Agent
+         self.session = requests.Session()
+         self.session.headers.update(
+             {
+                 "User-Agent": f"citations-collector/0.2 (mailto:{email})",
+                 "Accept": "application/pdf,*/*",
+             }
+         )
+
+         # Retry on 403, 429, 500, 502, 503, 504 with exponential backoff.
+         # Longer backoff helps with bioRxiv/Cloudflare protection.
+         retry_strategy = Retry(
+             total=4,
+             backoff_factor=3,  # urllib3 sleeps backoff_factor * 2**(retry - 1): roughly 0s, 6s, 12s, 24s
+             status_forcelist=[403, 429, 500, 502, 503, 504],
+             allowed_methods=["GET", "HEAD"],
+             respect_retry_after_header=True,  # Respect Retry-After from server
+         )
+         adapter = RetryAfterAdapter(max_retries=retry_strategy)
+         self.session.mount("http://", adapter)
+         self.session.mount("https://", adapter)
+
+         # Rate limiting: delay between downloads to avoid triggering Cloudflare
+         self._last_download_time = 0.0
+         self._download_delay = 2.0  # 2 seconds between downloads
+
+     def acquire_for_citation(self, citation: CitationRecord, dry_run: bool = False) -> bool:
+         """Look up OA status, download PDF if available. Returns True if PDF was acquired."""
+         if not citation.citation_doi:
+             return False
+
+         result = self.unpaywall.lookup(citation.citation_doi)
+         citation.oa_status = result.oa_status
+         citation.pdf_url = result.best_oa_url
+
+         if not result.best_oa_url or not result.is_oa:
+             return False
+
+         pdf_path = self._doi_to_path(citation.citation_doi)
+
+         if dry_run:
+             logger.info("Would download %s -> %s", citation.citation_doi, pdf_path)
+             return False
+
+         # Skip if already downloaded (check both .pdf and .html extensions)
+         full_path = self.output_dir / pdf_path
+         html_path = full_path.with_suffix(".html")
+
+         if full_path.exists():
+             citation.pdf_path = str(full_path)
+             logger.debug(f"PDF already exists: {full_path}")
+             return False
+         if html_path.exists():
+             citation.pdf_path = str(html_path)
+             logger.debug(f"HTML already exists: {html_path}")
+             return False
+
+         # Download PDF (or HTML if server returns that)
+         actual_path = self._download(result.best_oa_url, full_path)
+         if actual_path:
+             citation.pdf_path = str(actual_path)
+             # Also fetch BibTeX
+             self._fetch_bibtex(citation.citation_doi, actual_path.parent / "article.bib")
+             # git-annex
+             if self.git_annex:
+                 self._annex_addurl(actual_path, result.best_oa_url)
+             return True
+         return False
+
+     def acquire_all(
+         self,
+         citations: list[CitationRecord],
+         dry_run: bool = False,
+     ) -> dict[str, int]:
+         """Process all citations. Returns counts dict."""
+         counts = {"downloaded": 0, "skipped": 0, "no_oa": 0, "no_doi": 0, "error": 0}
+         seen_dois: set[str] = set()
+
+         for citation in citations:
+             if not citation.citation_doi:
+                 counts["no_doi"] += 1
+                 continue
+             if citation.citation_doi in seen_dois:
+                 # Copy fields from first citation with same DOI
+                 for prev in citations:
+                     if prev.citation_doi == citation.citation_doi and prev.oa_status:
+                         citation.oa_status = prev.oa_status
+                         citation.pdf_url = prev.pdf_url
+                         citation.pdf_path = prev.pdf_path
+                         break
+                 counts["skipped"] += 1
+                 continue
+             seen_dois.add(citation.citation_doi)
+
+             if citation.pdf_path and Path(citation.pdf_path).exists():
+                 counts["skipped"] += 1
+                 continue
+
+             try:
+                 if self.acquire_for_citation(citation, dry_run=dry_run):
+                     counts["downloaded"] += 1
+                 elif citation.oa_status == "closed" or not citation.pdf_url:
+                     counts["no_oa"] += 1
+                 else:
+                     counts["skipped"] += 1
+             except Exception:
+                 logger.exception("Error acquiring PDF for %s", citation.citation_doi)
+                 counts["error"] += 1
+
+         return counts
+
+     def _doi_to_path(self, doi: str) -> Path:
+         """Convert DOI to relative path: 10.1038/s41597-023-02214-y -> 10.1038/.../article.pdf"""
+         return Path(doi) / "article.pdf"
+
+     def _download(self, url: str, dest: Path) -> Path | None:
+         """
+         Download URL to dest with retry logic and content-type detection.
+
+         If server returns HTML instead of PDF, saves with .html extension.
+         Returns actual path on success, None on failure.
+         """
+         # Rate limiting: wait between downloads to avoid triggering Cloudflare
+         elapsed = time.time() - self._last_download_time
+         if elapsed < self._download_delay:
+             time.sleep(self._download_delay - elapsed)
+
+         dest.parent.mkdir(parents=True, exist_ok=True)
+         try:
+             self._last_download_time = time.time()
+             resp = self.session.get(url, timeout=60, stream=True)
+             resp.raise_for_status()
+
+             # Check Content-Type to detect HTML vs PDF
+             content_type = resp.headers.get("Content-Type", "").lower()
+             is_html = any(
+                 html_type in content_type
+                 for html_type in ["text/html", "application/xhtml+xml", "text/xml"]
+             )
+
+             # If HTML detected, change extension
+             if is_html:
+                 dest = dest.with_suffix(".html")
+                 logger.warning(
+                     "Server returned HTML instead of PDF for %s, saving as %s",
+                     url,
+                     dest.name,
+                 )
+
+             with open(dest, "wb") as f:
+                 for chunk in resp.iter_content(chunk_size=8192):
+                     f.write(chunk)
+             logger.info("Downloaded %s", dest)
+             return dest
+         except requests.RequestException as e:
+             logger.warning("Download failed for %s: %s", url, e)
+             if dest.exists():
+                 dest.unlink()
+             return None
+
+     def _fetch_bibtex(self, doi: str, dest: Path) -> None:
+         """Fetch BibTeX via DOI content negotiation."""
+         try:
+             resp = requests.get(
+                 f"https://doi.org/{doi}",
+                 headers={"Accept": "application/x-bibtex"},
+                 timeout=30,
+                 allow_redirects=True,
+             )
+             if resp.status_code == 200 and resp.text.strip():
+                 dest.write_text(resp.text)
+                 logger.info("Saved BibTeX to %s", dest)
+         except requests.RequestException as e:
+             logger.warning("BibTeX fetch failed for %s: %s", doi, e)
+
+     def _annex_addurl(self, path: Path, url: str) -> None:
+         """Register URL with git-annex."""
+         try:
+             subprocess.run(
+                 ["git", "annex", "addurl", "--file", str(path), url],
+                 check=True,
+                 capture_output=True,
+                 text=True,
+             )
+             logger.info("git annex addurl for %s", path)
+         except (subprocess.CalledProcessError, FileNotFoundError) as e:
+             logger.warning("git annex addurl failed for %s: %s", path, e)
@@ -0,0 +1,7 @@
+ """Persistence layer for loading/saving collections and citations."""
+
+ from __future__ import annotations
+
+ from citations_collector.persistence import tsv_io, yaml_io
+
+ __all__ = ["yaml_io", "tsv_io"]
@@ -0,0 +1,121 @@
+ """Load and save TSV citation files."""
+
+ from __future__ import annotations
+
+ import csv
+ from contextlib import suppress
+ from pathlib import Path
+
+ from citations_collector.models import CitationRecord
+
+ # TSV column order matching examples/citations-example.tsv
+ TSV_COLUMNS = [
+     "item_id",
+     "item_flavor",
+     "item_ref_type",
+     "item_ref_value",
+     "item_name",
+     "citation_doi",
+     "citation_pmid",
+     "citation_arxiv",
+     "citation_url",
+     "citation_title",
+     "citation_authors",
+     "citation_year",
+     "citation_journal",
+     "citation_relationship",
+     "citation_type",
+     "citation_sources",  # Plural - can contain comma-separated values
+     "discovered_date",
+     "citation_status",
+     "citation_merged_into",
+     "citation_comment",
+     "curated_by",
+     "curated_date",
+     "oa_status",
+     "pdf_url",
+     "pdf_path",
+ ]
+
+
+ def load_citations(path: Path) -> list[CitationRecord]:
+     """
+     Load citations from TSV file.
+
+     Args:
+         path: Path to TSV file
+
+     Returns:
+         List of CitationRecord objects
+
+     Raises:
+         FileNotFoundError: If file doesn't exist
+     """
+     citations = []
+
+     with open(path, newline="") as f:
+         reader = csv.DictReader(f, delimiter="\t")
+
+         for row in reader:
+             # Remove empty string values (treat as None)
+             cleaned = {k: (v if v != "" else None) for k, v in row.items()}
+
+             # Convert year to int if present
+             if cleaned.get("citation_year"):
+                 with suppress(ValueError):
+                     cleaned["citation_year"] = int(cleaned["citation_year"])  # type: ignore[arg-type]
+
+             # Parse citation_sources from TSV (comma-separated)
+             # Support both old "citation_source" and new "citation_sources" columns
+             sources_field = cleaned.get("citation_sources") or cleaned.get("citation_source")
+             if sources_field and "," in str(sources_field):
+                 # Multiple sources - parse into list
+                 sources = [s.strip() for s in sources_field.split(",")]
+                 cleaned["citation_sources"] = sources
+                 # Set citation_source to first (required field, backward compat)
+                 cleaned["citation_source"] = sources[0]
+             elif sources_field:
+                 # Single source - still create list for consistency
+                 cleaned["citation_sources"] = [sources_field]
+                 cleaned["citation_source"] = sources_field
+             else:
+                 # No source field - set default for backward compatibility
+                 # This can happen with old TSV files or test data
+                 # Use "manual" as it's the appropriate enum value for unspecified sources
+                 cleaned["citation_source"] = "manual"
+                 cleaned["citation_sources"] = ["manual"]
+
+             # Create CitationRecord, only including fields that are in the model
+             citation = CitationRecord(**cleaned)  # type: ignore[arg-type]
+             citations.append(citation)
+
+     return citations
+
+
+ def save_citations(citations: list[CitationRecord], path: Path) -> None:
+     """
+     Save citations to TSV file.
+
+     Args:
+         citations: List of CitationRecord objects
+         path: Path to output TSV file
+     """
+     with open(path, "w", newline="") as f:
+         writer = csv.DictWriter(f, fieldnames=TSV_COLUMNS, delimiter="\t", extrasaction="ignore")
+         writer.writeheader()
+
+         for citation in citations:
+             # Convert to dict
+             data = citation.model_dump(exclude_none=False, mode="python")
+
+             # Serialize citation_sources list to comma-separated string
+             if "citation_sources" in data and data["citation_sources"]:
+                 data["citation_sources"] = ", ".join(data["citation_sources"])
+             # Remove citation_source (singular, deprecated field) from output
+             if "citation_source" in data:
+                 del data["citation_source"]
+
+             # Convert None to empty string for TSV
+             cleaned = {k: ("" if v is None else str(v)) for k, v in data.items()}
+
+             writer.writerow(cleaned)
@@ -0,0 +1,50 @@
+ """Load and save YAML collection files."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import yaml
+
+ from citations_collector.models import Collection
+
+
+ def load_collection(path: Path) -> Collection:
+     """
+     Load collection from YAML file.
+
+     Args:
+         path: Path to YAML file
+
+     Returns:
+         Collection object
+
+     Raises:
+         FileNotFoundError: If file doesn't exist
+         ValidationError: If YAML doesn't match schema
+     """
+     with open(path) as f:
+         data = yaml.safe_load(f)
+
+     return Collection(**data)
+
+
+ def save_collection(collection: Collection, path: Path) -> None:
+     """
+     Save collection to YAML file.
+
+     Args:
+         collection: Collection object to save
+         path: Path to output YAML file
+     """
+     # Convert to dict, excluding None values for cleaner output
+     data = collection.model_dump(exclude_none=True, mode="python")
+
+     with open(path, "w") as f:
+         yaml.safe_dump(
+             data,
+             f,
+             default_flow_style=False,
+             sort_keys=False,
+             allow_unicode=True,
+         )
File without changes
@@ -0,0 +1,60 @@
+ """Unpaywall API client for open access PDF discovery."""
+
+ from __future__ import annotations
+
+ import logging
+ import time
+ from dataclasses import dataclass
+
+ import requests
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class UnpaywallResult:
+     doi: str
+     is_oa: bool
+     oa_status: str  # gold/green/bronze/hybrid/closed
+     best_oa_url: str | None  # direct PDF URL
+     license: str | None
+
+
+ class UnpaywallClient:
+     """Minimal Unpaywall REST API client with simple rate limiting."""
+
+     BASE_URL = "https://api.unpaywall.org/v2/"
+
+     def __init__(self, email: str = "site-unpaywall@oneukrainian.com") -> None:
+         self.email = email
+         self._last_request_time = 0.0
+
+     def lookup(self, doi: str) -> UnpaywallResult:
+         """Look up OA status and PDF URL for a DOI."""
+         self._rate_limit()
+         url = f"{self.BASE_URL}{doi}"
+         try:
+             resp = requests.get(url, params={"email": self.email}, timeout=30)
+             if resp.status_code == 404:
+                 return UnpaywallResult(
+                     doi=doi, is_oa=False, oa_status="closed", best_oa_url=None, license=None
+                 )
+             resp.raise_for_status()
+             data = resp.json()
+             best_loc = data.get("best_oa_location") or {}
+             return UnpaywallResult(
+                 doi=doi,
+                 is_oa=data.get("is_oa", False),
+                 oa_status=data.get("oa_status", "closed") or "closed",
+                 best_oa_url=best_loc.get("url_for_pdf") or best_loc.get("url"),
+                 license=best_loc.get("license"),
+             )
+         except requests.RequestException as e:
+             logger.warning("Unpaywall lookup failed for %s: %s", doi, e)
+             return UnpaywallResult(
+                 doi=doi, is_oa=False, oa_status="closed", best_oa_url=None, license=None
+             )
+
+     def _rate_limit(self) -> None:
+         elapsed = time.monotonic() - self._last_request_time
+         if elapsed < 0.1:
+             time.sleep(0.1 - elapsed)
+         self._last_request_time = time.monotonic()