citations-collector 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ """Import items from Zotero groups/collections."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ import re
8
+ from datetime import date
9
+ from typing import Any, cast
10
+
11
+ from pyzotero import zotero
12
+
13
+ from citations_collector.models import Collection, Item, ItemFlavor, ItemRef, RefType
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ZoteroImporter:
    """
    Import items from a Zotero group or collection.

    Extracts DOIs and other identifiers from Zotero item metadata
    to create a Collection for citation tracking.

    Example:
        importer = ZoteroImporter(api_key="your-api-key")
        collection = importer.import_group(group_id=5774211)
    """

    # A bare DOI: "10." + registrant code (4+ digits) + "/" + suffix.
    DOI_PATTERN = re.compile(r"^10\.\d{4,}/[^\s]+$")

    # Prefixes commonly pasted into Zotero's DOI field that must be
    # stripped before validation ("doi:10...." or a resolver URL).
    _DOI_FIELD_PREFIX = re.compile(
        r"^(?:doi:\s*|https?://(?:dx\.)?doi\.org/)", re.IGNORECASE
    )

    def __init__(self, api_key: str | None = None) -> None:
        """
        Initialize Zotero importer.

        Args:
            api_key: Zotero API key. If not provided, reads from
                ZOTERO_API_KEY environment variable.
                Public groups can be read without an API key.
        """
        self.api_key = api_key or os.getenv("ZOTERO_API_KEY")

    def import_group(
        self,
        group_id: int,
        collection_key: str | None = None,
        limit: int | None = None,
    ) -> Collection:
        """
        Import items from a Zotero group.

        Args:
            group_id: Zotero group ID
            collection_key: Optional collection key within the group.
                If None, imports all items in the group.
            limit: Optional limit on number of items to import.

        Returns:
            Collection with items extracted from Zotero.
            Each item gets:
            - item_id: "zotero:{item_key}" or DOI-based ID if available
            - flavor_id: "main" (single flavor per item)
            - ref: DOI extracted from Zotero metadata
        """
        # Initialize pyzotero client for the group library.
        zot = zotero.Zotero(group_id, "group", self.api_key)

        # Fetch raw items (whole group, or a single collection within it).
        if collection_key:
            raw_items = self._fetch_collection_items(zot, collection_key, limit)
            collection_name = self._get_collection_name(zot, collection_key)
        else:
            raw_items = self._fetch_all_items(zot, limit)
            collection_name = self._get_group_name(zot, group_id)

        # Drop attachments/notes and anything without a usable identifier.
        items: list[Item] = [
            item
            for raw_item in raw_items
            if (item := self._zotero_item_to_item(raw_item)) is not None
        ]

        # Lazy %-args so the message is only formatted when emitted.
        logger.info("Imported %d items from Zotero group %s", len(items), group_id)

        return Collection(
            name=collection_name or f"Zotero Group {group_id}",
            description=f"Items imported from Zotero group {group_id}",
            homepage=f"https://www.zotero.org/groups/{group_id}",
            source_type="zotero",
            zotero_group_id=group_id,
            zotero_collection_key=collection_key,
            items=items,
        )

    def _fetch_all_items(
        self, zot: zotero.Zotero, limit: int | None = None
    ) -> list[dict[str, Any]]:
        """Fetch items from a Zotero library (all pages unless limited)."""
        try:
            if limit:
                return cast(list[dict[str, Any]], zot.items(limit=limit))
            # zot.everything() follows pagination to retrieve every item.
            return cast(list[dict[str, Any]], zot.everything(zot.items()))
        except Exception as e:
            # Best-effort: an unreachable/invalid library yields an empty list.
            logger.error("Failed to fetch Zotero items: %s", e)
            return []

    def _fetch_collection_items(
        self, zot: zotero.Zotero, collection_key: str, limit: int | None = None
    ) -> list[dict[str, Any]]:
        """Fetch items from a specific collection (all pages unless limited)."""
        try:
            if limit:
                return cast(list[dict[str, Any]], zot.collection_items(collection_key, limit=limit))
            return cast(
                list[dict[str, Any]], zot.everything(zot.collection_items(collection_key))
            )
        except Exception as e:
            logger.error("Failed to fetch collection %s: %s", collection_key, e)
            return []

    def _get_collection_name(self, zot: zotero.Zotero, collection_key: str) -> str | None:
        """Get the name of a collection, or None if it cannot be read."""
        try:
            collection = zot.collection(collection_key)
            name: str | None = collection.get("data", {}).get("name")
            return name
        except Exception:
            return None

    def _get_group_name(self, zot: zotero.Zotero, group_id: int) -> str | None:
        """Get the name of a group."""
        # pyzotero doesn't have a direct group info method for group libraries.
        # Return None so import_group() falls back to "Zotero Group {id}".
        return None

    def _zotero_item_to_item(self, raw_item: dict) -> Item | None:
        """
        Convert a raw Zotero item to a Collection Item.

        Args:
            raw_item: Raw item payload from pyzotero

        Returns:
            Item with DOI/PMID/URL refs, or None for attachments, notes,
            and items with no usable identifier.
        """
        data = raw_item.get("data", {})
        item_key = raw_item.get("key", "")

        # Attachments and notes are not citable works.
        if data.get("itemType", "") in ("attachment", "note"):
            return None

        doi = self._extract_doi(data)

        # Zotero stores PMIDs in the free-text "extra" field.
        extra = data.get("extra", "")
        pmid_match = re.search(r"PMID:\s*(\d+)", extra)
        pmid_value = pmid_match.group(1) if pmid_match else None

        # Build refs in priority order: DOI, then PMID.
        refs: list[ItemRef] = []
        if doi:
            refs.append(ItemRef(ref_type=RefType.doi, ref_value=doi))
        if pmid_value:
            refs.append(ItemRef(ref_type=RefType.pmid, ref_value=pmid_value))

        url = data.get("url")
        if not refs:
            # Fall back to the URL; without any identifier, skip the item.
            if url:
                refs.append(ItemRef(ref_type=RefType.url, ref_value=url))
            else:
                logger.debug("Skipping Zotero item %s: no DOI, PMID, or URL", item_key)
                return None

        # Prefer a stable DOI-based ID over the Zotero key.
        item_id = f"doi:{doi}" if doi else f"zotero:{item_key}"

        title = data.get("title", f"Untitled ({item_key})")
        release_date = self._parse_date(data.get("date", ""))

        # Single "main" flavor carrying all refs for this item.
        flavor = ItemFlavor(
            flavor_id="main",
            name=title,
            release_date=release_date,
            refs=refs,
        )

        return Item(
            item_id=item_id,
            name=title,
            homepage=url,
            flavors=[flavor],
        )

    def _extract_doi(self, data: dict[str, Any]) -> str | None:
        """
        Extract a bare DOI from Zotero item data.

        Checks the dedicated DOI field (tolerating "doi:" and resolver-URL
        prefixes), then the URL field, then the free-text "extra" field.
        """
        # Dedicated DOI field; normalize common prefixes before validating.
        raw_doi = str(data.get("DOI", "")).strip()
        if raw_doi:
            candidate = self._DOI_FIELD_PREFIX.sub("", raw_doi)
            if self.DOI_PATTERN.match(candidate):
                return candidate

        # URL field containing a doi.org (or dx.doi.org) link.
        url = data.get("url", "")
        if "doi.org/" in url:
            match = re.search(r"doi\.org/(10\.\d{4,}/[^\s]+)", url)
            if match:
                return match.group(1)

        # Free-text "extra" field, e.g. "DOI: 10.1234/abcd".
        extra = data.get("extra", "")
        match = re.search(r"DOI:\s*(10\.\d{4,}/[^\s]+)", extra, re.IGNORECASE)
        if match:
            return match.group(1)

        return None

    def _parse_date(self, date_str: str) -> date | None:
        """Parse the date formats Zotero commonly emits.

        Returns None when the string is empty or unrecognized. Partial
        dates (year only, month + year) are pinned to the first day.
        """
        if not date_str:
            return None

        # ISO timestamps: keep only the leading YYYY-MM-DD part.
        try:
            return date.fromisoformat(date_str[:10])
        except ValueError:
            pass

        # Bare year -> January 1st of that year.
        if re.fullmatch(r"\d{4}", date_str):
            return date(int(date_str), 1, 1)

        # Other common textual formats (incl. month+year partial dates).
        from datetime import datetime

        for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d", "%Y/%m/%d", "%B %Y", "%b %Y"):
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue

        return None
@@ -0,0 +1,216 @@
1
+ """Detect and mark merged citations (preprints with published versions)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING
7
+
8
+ import requests
9
+ from rapidfuzz import fuzz
10
+
11
+ if TYPE_CHECKING:
12
+ from citations_collector.models.generated import CitationRecord
13
+
14
+ from citations_collector.models.generated import CitationStatus
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class MergeDetector:
20
+ """Detect preprints that have published versions and mark them as merged."""
21
+
22
+ def __init__(self, email: str = "site-unpaywall@oneukrainian.com", timeout: int = 30):
23
+ """Initialize the merge detector.
24
+
25
+ Args:
26
+ email: Email for CrossRef API (polite pool)
27
+ timeout: HTTP request timeout in seconds
28
+ """
29
+ self.email = email
30
+ self.timeout = timeout
31
+ self.session = requests.Session()
32
+ self.session.headers.update({"User-Agent": f"citations-collector ({email})"})
33
+
34
+ def detect_merged_pairs(self, citations: list[CitationRecord]) -> dict[str, str]:
35
+ """Detect which citations are preprints with published versions.
36
+
37
+ Args:
38
+ citations: List of citation records to analyze
39
+
40
+ Returns:
41
+ Dictionary mapping preprint DOI -> published DOI
42
+ """
43
+ merged_pairs: dict[str, str] = {}
44
+ doi_to_citation = {c.citation_doi: c for c in citations if c.citation_doi}
45
+
46
+ for citation in citations:
47
+ if not citation.citation_doi:
48
+ continue
49
+
50
+ # Check if this is a preprint with a published version
51
+ published_doi = self._get_published_version(citation.citation_doi)
52
+ if published_doi and (
53
+ published_doi in doi_to_citation or self._verify_doi_exists(published_doi)
54
+ ):
55
+ merged_pairs[citation.citation_doi] = published_doi
56
+ logger.info(f"Detected merge: {citation.citation_doi} -> {published_doi}")
57
+
58
+ return merged_pairs
59
+
60
+ def _get_published_version(self, doi: str) -> str | None:
61
+ """Get the published version DOI for a preprint.
62
+
63
+ Args:
64
+ doi: DOI of the potential preprint
65
+
66
+ Returns:
67
+ DOI of published version, or None if not found
68
+ """
69
+ try:
70
+ # Query CrossRef for this DOI's metadata
71
+ url = f"https://api.crossref.org/works/{doi}"
72
+ params = {"mailto": self.email}
73
+ resp = self.session.get(url, params=params, timeout=self.timeout)
74
+ resp.raise_for_status()
75
+ data = resp.json()
76
+
77
+ message = data.get("message", {})
78
+
79
+ # Check for "is-preprint-of" relationship
80
+ relations = message.get("relation", {})
81
+ is_preprint_of = relations.get("is-preprint-of", [])
82
+
83
+ for rel in is_preprint_of:
84
+ if "id" in rel:
85
+ # Extract DOI from the full URL if needed
86
+ rel_id = str(rel["id"])
87
+ if rel_id.startswith("https://doi.org/"):
88
+ return str(rel_id.replace("https://doi.org/", ""))
89
+ elif rel_id.startswith("http://dx.doi.org/"):
90
+ return str(rel_id.replace("http://dx.doi.org/", ""))
91
+ return str(rel_id)
92
+
93
+ # Check if this is a bioRxiv/medRxiv preprint (common case)
94
+ # Sometimes the relationship isn't explicit but the DOI pattern helps
95
+ if self._is_preprint_server(doi):
96
+ # Try to find via title fuzzy matching in our dataset
97
+ # (this is a fallback and should be used carefully)
98
+ pass
99
+
100
+ except requests.RequestException as e:
101
+ logger.warning(f"Failed to check CrossRef for {doi}: {e}")
102
+ except (KeyError, ValueError) as e:
103
+ logger.warning(f"Unexpected CrossRef response format for {doi}: {e}")
104
+
105
+ return None
106
+
107
+ def _is_preprint_server(self, doi: str) -> bool:
108
+ """Check if DOI is from a known preprint server.
109
+
110
+ Args:
111
+ doi: DOI to check
112
+
113
+ Returns:
114
+ True if from a preprint server
115
+ """
116
+ preprint_prefixes = [
117
+ "10.1101/", # bioRxiv, medRxiv
118
+ "10.31219/", # OSF Preprints
119
+ "10.20944/", # Preprints.org
120
+ "10.48550/", # arXiv
121
+ ]
122
+ return any(doi.startswith(prefix) for prefix in preprint_prefixes)
123
+
124
+ def _verify_doi_exists(self, doi: str) -> bool:
125
+ """Verify that a DOI exists and is accessible.
126
+
127
+ Args:
128
+ doi: DOI to verify
129
+
130
+ Returns:
131
+ True if DOI exists
132
+ """
133
+ try:
134
+ url = f"https://api.crossref.org/works/{doi}"
135
+ params = {"mailto": self.email}
136
+ resp = self.session.get(url, params=params, timeout=self.timeout, allow_redirects=False)
137
+ return bool(resp.status_code == 200)
138
+ except requests.RequestException:
139
+ return False
140
+
141
+ def mark_merged_citations(
142
+ self, citations: list[CitationRecord], merged_pairs: dict[str, str]
143
+ ) -> int:
144
+ """Mark citations as merged in place.
145
+
146
+ Args:
147
+ citations: List of citation records to update
148
+ merged_pairs: Dictionary mapping preprint DOI -> published DOI
149
+
150
+ Returns:
151
+ Number of citations marked as merged
152
+ """
153
+ marked_count = 0
154
+ for citation in citations:
155
+ if citation.citation_doi and citation.citation_doi in merged_pairs:
156
+ citation.citation_status = CitationStatus.merged
157
+ citation.citation_merged_into = merged_pairs[citation.citation_doi]
158
+ marked_count += 1
159
+ logger.info(
160
+ f"Marked {citation.citation_doi} as merged into {citation.citation_merged_into}"
161
+ )
162
+ return marked_count
163
+
164
+ def fuzzy_match_by_title(
165
+ self,
166
+ citations: list[CitationRecord],
167
+ threshold: int = 90,
168
+ ) -> dict[str, str]:
169
+ """Find potential merges by fuzzy title matching (fallback method).
170
+
171
+ This is a heuristic approach for cases where CrossRef relationships
172
+ are not explicitly registered.
173
+
174
+ Args:
175
+ citations: List of citation records
176
+ threshold: Minimum similarity score (0-100) for matching
177
+
178
+ Returns:
179
+ Dictionary mapping preprint DOI -> published DOI candidates
180
+ """
181
+ potential_pairs: dict[str, str] = {}
182
+ preprints = [
183
+ c for c in citations if c.citation_doi and self._is_preprint_server(c.citation_doi)
184
+ ]
185
+ published = [
186
+ c for c in citations if c.citation_doi and not self._is_preprint_server(c.citation_doi)
187
+ ]
188
+
189
+ for preprint in preprints:
190
+ if not preprint.citation_title:
191
+ continue
192
+
193
+ best_match = None
194
+ best_score: float = 0.0
195
+
196
+ for pub in published:
197
+ if not pub.citation_title:
198
+ continue
199
+
200
+ # Check if they have similar authors (if available)
201
+ # and similar publication years
202
+ # (This is a heuristic - use with caution)
203
+
204
+ score = fuzz.ratio(preprint.citation_title.lower(), pub.citation_title.lower())
205
+ if score > best_score and score >= threshold:
206
+ best_score = score
207
+ best_match = pub
208
+
209
+ if best_match and preprint.citation_doi and best_match.citation_doi:
210
+ logger.info(
211
+ f"Fuzzy match found (score {best_score}): "
212
+ f"{preprint.citation_doi} ~> {best_match.citation_doi}"
213
+ )
214
+ potential_pairs[preprint.citation_doi] = best_match.citation_doi
215
+
216
+ return potential_pairs
@@ -0,0 +1,44 @@
1
+ """Data models for citations-collector.
2
+
3
+ Generated from LinkML schema at schema/citations.yaml.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from citations_collector.models.generated import (
9
+ CitationRecord,
10
+ CitationRelationship,
11
+ CitationSource,
12
+ CitationStatus,
13
+ CitationType,
14
+ Collection,
15
+ CurationConfig,
16
+ CurationRule,
17
+ DiscoverConfig,
18
+ Item,
19
+ ItemFlavor,
20
+ ItemRef,
21
+ PdfsConfig,
22
+ RefType,
23
+ SourceConfig,
24
+ ZoteroConfig,
25
+ )
26
+
27
+ __all__ = [
28
+ "CitationRecord",
29
+ "CitationRelationship",
30
+ "CitationSource",
31
+ "CitationStatus",
32
+ "CitationType",
33
+ "Collection",
34
+ "CurationConfig",
35
+ "CurationRule",
36
+ "DiscoverConfig",
37
+ "Item",
38
+ "ItemFlavor",
39
+ "ItemRef",
40
+ "PdfsConfig",
41
+ "RefType",
42
+ "SourceConfig",
43
+ "ZoteroConfig",
44
+ ]