citations-collector 0.2.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ """PDF acquisition and management."""
+
+ from __future__ import annotations
+
+ import logging
+ import subprocess
+ import time
+ from collections.abc import Mapping
+ from pathlib import Path
+
+ import requests
+ from requests.adapters import HTTPAdapter
+ from requests.models import PreparedRequest, Response
+ from urllib3.util.retry import Retry
+
+ from citations_collector.models import CitationRecord
+ from citations_collector.unpaywall import UnpaywallClient
+
+ logger = logging.getLogger(__name__)
+
+
+ class RetryAfterAdapter(HTTPAdapter):
+     """HTTPAdapter that respects Retry-After header from server."""
+
+     def send(
+         self,
+         request: PreparedRequest,
+         stream: bool = False,
+         timeout: float | tuple[float, float] | tuple[float, None] | None = None,
+         verify: bool | str = True,
+         cert: bytes | str | tuple[bytes | str, bytes | str] | None = None,
+         proxies: Mapping[str, str] | None = None,
+     ) -> Response:
+         """Send request with Retry-After header support."""
+         response = super().send(
+             request,
+             stream=stream,
+             timeout=timeout,
+             verify=verify,
+             cert=cert,
+             proxies=proxies,
+         )
+
+         # Check for Retry-After header on 429/503 responses
+         if response.status_code in (429, 503):
+             retry_after = response.headers.get("Retry-After")
+             if retry_after:
+                 try:
+                     # Retry-After can be seconds (int) or HTTP date
+                     delay = int(retry_after)
+                     logger.warning(
+                         f"Rate limited by {request.url}, waiting {delay}s (Retry-After header)"
+                     )
+                     time.sleep(delay)
+                 except ValueError:
+                     # HTTP date format - default to 60s
+                     logger.warning(f"Rate limited by {request.url}, waiting 60s")
+                     time.sleep(60)
+
+         return response
+
+
+ class PDFAcquirer:
+     """Acquire open-access PDFs for citations via Unpaywall, with optional git-annex registration."""
+
+     def __init__(
+         self,
+         output_dir: Path = Path("pdfs"),
+         email: str = "site-unpaywall@oneukrainian.com",
+         git_annex: bool = False,
+     ) -> None:
+         self.output_dir = Path(output_dir)
+         self.unpaywall = UnpaywallClient(email=email)
+         self.git_annex = git_annex
+
+         # Create session with retry logic and proper User-Agent
+         self.session = requests.Session()
+         self.session.headers.update(
+             {
+                 "User-Agent": f"citations-collector/0.2 (mailto:{email})",
+                 "Accept": "application/pdf,*/*",
+             }
+         )
+
+         # Retry on 403, 429, 500, 502, 503, 504 with exponential backoff.
+         # Longer backoff helps with bioRxiv/Cloudflare protection.
+         retry_strategy = Retry(
+             total=4,
+             backoff_factor=3,  # urllib3 sleeps backoff_factor * 2**(retry - 1): roughly 0s, 6s, 12s, 24s
+             status_forcelist=[403, 429, 500, 502, 503, 504],
+             allowed_methods=["GET", "HEAD"],
+             respect_retry_after_header=True,  # Respect Retry-After from server
+         )
+         adapter = RetryAfterAdapter(max_retries=retry_strategy)
+         self.session.mount("http://", adapter)
+         self.session.mount("https://", adapter)
+
+         # Rate limiting: delay between downloads to avoid triggering Cloudflare
+         self._last_download_time = 0.0
+         self._download_delay = 2.0  # 2 seconds between downloads
+
+     def acquire_for_citation(self, citation: CitationRecord, dry_run: bool = False) -> bool:
+         """Look up OA status, download PDF if available. Returns True if PDF was acquired."""
+         if not citation.citation_doi:
+             return False
+
+         result = self.unpaywall.lookup(citation.citation_doi)
+         citation.oa_status = result.oa_status
+         citation.pdf_url = result.best_oa_url
+
+         if not result.best_oa_url or not result.is_oa:
+             return False
+
+         pdf_path = self._doi_to_path(citation.citation_doi)
+
+         if dry_run:
+             logger.info("Would download %s -> %s", citation.citation_doi, pdf_path)
+             return False
+
+         # Skip if already downloaded (check both .pdf and .html extensions)
+         full_path = self.output_dir / pdf_path
+         html_path = full_path.with_suffix(".html")
+
+         if full_path.exists():
+             citation.pdf_path = str(full_path)
+             logger.debug(f"PDF already exists: {full_path}")
+             return False
+         if html_path.exists():
+             citation.pdf_path = str(html_path)
+             logger.debug(f"HTML already exists: {html_path}")
+             return False
+
+         # Download PDF (or HTML if server returns that)
+         actual_path = self._download(result.best_oa_url, full_path)
+         if actual_path:
+             citation.pdf_path = str(actual_path)
+             # Also fetch BibTeX
+             self._fetch_bibtex(citation.citation_doi, actual_path.parent / "article.bib")
+             # git-annex
+             if self.git_annex:
+                 self._annex_addurl(actual_path, result.best_oa_url)
+             return True
+         return False
+
+     def acquire_all(
+         self,
+         citations: list[CitationRecord],
+         dry_run: bool = False,
+     ) -> dict[str, int]:
+         """Process all citations. Returns counts dict."""
+         counts = {"downloaded": 0, "skipped": 0, "no_oa": 0, "no_doi": 0, "error": 0}
+         seen_dois: set[str] = set()
+
+         for citation in citations:
+             if not citation.citation_doi:
+                 counts["no_doi"] += 1
+                 continue
+             if citation.citation_doi in seen_dois:
+                 # Copy fields from first citation with same DOI
+                 for prev in citations:
+                     if prev.citation_doi == citation.citation_doi and prev.oa_status:
+                         citation.oa_status = prev.oa_status
+                         citation.pdf_url = prev.pdf_url
+                         citation.pdf_path = prev.pdf_path
+                         break
+                 counts["skipped"] += 1
+                 continue
+             seen_dois.add(citation.citation_doi)
+
+             if citation.pdf_path and Path(citation.pdf_path).exists():
+                 counts["skipped"] += 1
+                 continue
+
+             try:
+                 if self.acquire_for_citation(citation, dry_run=dry_run):
+                     counts["downloaded"] += 1
+                 elif citation.oa_status == "closed" or not citation.pdf_url:
+                     counts["no_oa"] += 1
+                 else:
+                     counts["skipped"] += 1
+             except Exception:
+                 logger.exception("Error acquiring PDF for %s", citation.citation_doi)
+                 counts["error"] += 1
+
+         return counts
+
+     def _doi_to_path(self, doi: str) -> Path:
+         """Convert DOI to relative path: 10.1038/s41597-023-02214-y -> 10.1038/.../article.pdf"""
+         return Path(doi) / "article.pdf"
+
+     def _download(self, url: str, dest: Path) -> Path | None:
+         """
+         Download URL to dest with retry logic and content-type detection.
+
+         If server returns HTML instead of PDF, saves with .html extension.
+         Returns actual path on success, None on failure.
+         """
+         # Rate limiting: wait between downloads to avoid triggering Cloudflare
+         elapsed = time.time() - self._last_download_time
+         if elapsed < self._download_delay:
+             time.sleep(self._download_delay - elapsed)
+
+         dest.parent.mkdir(parents=True, exist_ok=True)
+         try:
+             self._last_download_time = time.time()
+             resp = self.session.get(url, timeout=60, stream=True)
+             resp.raise_for_status()
+
+             # Check Content-Type to detect HTML vs PDF
+             content_type = resp.headers.get("Content-Type", "").lower()
+             is_html = any(
+                 html_type in content_type
+                 for html_type in ["text/html", "application/xhtml+xml", "text/xml"]
+             )
+
+             # If HTML detected, change extension
+             if is_html:
+                 dest = dest.with_suffix(".html")
+                 logger.warning(
+                     "Server returned HTML instead of PDF for %s, saving as %s",
+                     url,
+                     dest.name,
+                 )
+
+             with open(dest, "wb") as f:
+                 for chunk in resp.iter_content(chunk_size=8192):
+                     f.write(chunk)
+             logger.info("Downloaded %s", dest)
+             return dest
+         except requests.RequestException as e:
+             logger.warning("Download failed for %s: %s", url, e)
+             if dest.exists():
+                 dest.unlink()
+             return None
+
+     def _fetch_bibtex(self, doi: str, dest: Path) -> None:
+         """Fetch BibTeX via DOI content negotiation."""
+         try:
+             resp = requests.get(
+                 f"https://doi.org/{doi}",
+                 headers={"Accept": "application/x-bibtex"},
+                 timeout=30,
+                 allow_redirects=True,
+             )
+             if resp.status_code == 200 and resp.text.strip():
+                 dest.write_text(resp.text)
+                 logger.info("Saved BibTeX to %s", dest)
+         except requests.RequestException as e:
+             logger.warning("BibTeX fetch failed for %s: %s", doi, e)
+
+     def _annex_addurl(self, path: Path, url: str) -> None:
+         """Register URL with git-annex."""
+         try:
+             subprocess.run(
+                 ["git", "annex", "addurl", "--file", str(path), url],
+                 check=True,
+                 capture_output=True,
+                 text=True,
+             )
+             logger.info("git annex addurl for %s", path)
+         except (subprocess.CalledProcessError, FileNotFoundError) as e:
+             logger.warning("git annex addurl failed for %s: %s", path, e)
@@ -0,0 +1,7 @@
+ """Persistence layer for loading/saving collections and citations."""
+
+ from __future__ import annotations
+
+ from citations_collector.persistence import tsv_io, yaml_io
+
+ __all__ = ["yaml_io", "tsv_io"]
@@ -0,0 +1,121 @@
+ """Load and save TSV citation files."""
+
+ from __future__ import annotations
+
+ import csv
+ from contextlib import suppress
+ from pathlib import Path
+
+ from citations_collector.models import CitationRecord
+
+ # TSV column order matching examples/citations-example.tsv
+ TSV_COLUMNS = [
+     "item_id",
+     "item_flavor",
+     "item_ref_type",
+     "item_ref_value",
+     "item_name",
+     "citation_doi",
+     "citation_pmid",
+     "citation_arxiv",
+     "citation_url",
+     "citation_title",
+     "citation_authors",
+     "citation_year",
+     "citation_journal",
+     "citation_relationship",
+     "citation_type",
+     "citation_sources",  # Plural - can contain comma-separated values
+     "discovered_date",
+     "citation_status",
+     "citation_merged_into",
+     "citation_comment",
+     "curated_by",
+     "curated_date",
+     "oa_status",
+     "pdf_url",
+     "pdf_path",
+ ]
+
+
+ def load_citations(path: Path) -> list[CitationRecord]:
+     """
+     Load citations from TSV file.
+
+     Args:
+         path: Path to TSV file
+
+     Returns:
+         List of CitationRecord objects
+
+     Raises:
+         FileNotFoundError: If file doesn't exist
+     """
+     citations = []
+
+     with open(path, newline="") as f:
+         reader = csv.DictReader(f, delimiter="\t")
+
+         for row in reader:
+             # Remove empty string values (treat as None)
+             cleaned = {k: (v if v != "" else None) for k, v in row.items()}
+
+             # Convert year to int if present
+             if cleaned.get("citation_year"):
+                 with suppress(ValueError):
+                     cleaned["citation_year"] = int(cleaned["citation_year"])  # type: ignore[arg-type]
+
+             # Parse citation_sources from TSV (comma-separated)
+             # Support both old "citation_source" and new "citation_sources" columns
+             sources_field = cleaned.get("citation_sources") or cleaned.get("citation_source")
+             if sources_field and "," in str(sources_field):
+                 # Multiple sources - parse into list
+                 sources = [s.strip() for s in sources_field.split(",")]
+                 cleaned["citation_sources"] = sources
+                 # Set citation_source to first (required field, backward compat)
+                 cleaned["citation_source"] = sources[0]
+             elif sources_field:
+                 # Single source - still create list for consistency
+                 cleaned["citation_sources"] = [sources_field]
+                 cleaned["citation_source"] = sources_field
+             else:
+                 # No source field - set default for backward compatibility
+                 # This can happen with old TSV files or test data
+                 # Use "manual" as it's the appropriate enum value for unspecified sources
+                 cleaned["citation_source"] = "manual"
+                 cleaned["citation_sources"] = ["manual"]
+
+             # Create CitationRecord, only including fields that are in the model
+             citation = CitationRecord(**cleaned)  # type: ignore[arg-type]
+             citations.append(citation)
+
+     return citations
+
+
+ def save_citations(citations: list[CitationRecord], path: Path) -> None:
+     """
+     Save citations to TSV file.
+
+     Args:
+         citations: List of CitationRecord objects
+         path: Path to output TSV file
+     """
+     with open(path, "w", newline="") as f:
+         writer = csv.DictWriter(f, fieldnames=TSV_COLUMNS, delimiter="\t", extrasaction="ignore")
+         writer.writeheader()
+
+         for citation in citations:
+             # Convert to dict
+             data = citation.model_dump(exclude_none=False, mode="python")
+
+             # Serialize citation_sources list to comma-separated string
+             if "citation_sources" in data and data["citation_sources"]:
+                 data["citation_sources"] = ", ".join(data["citation_sources"])
+             # Remove citation_source (singular, deprecated field) from output
+             if "citation_source" in data:
+                 del data["citation_source"]
+
+             # Convert None to empty string for TSV
+             cleaned = {k: ("" if v is None else str(v)) for k, v in data.items()}
+
+             writer.writerow(cleaned)
@@ -0,0 +1,50 @@
+ """Load and save YAML collection files."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import yaml
+
+ from citations_collector.models import Collection
+
+
+ def load_collection(path: Path) -> Collection:
+     """
+     Load collection from YAML file.
+
+     Args:
+         path: Path to YAML file
+
+     Returns:
+         Collection object
+
+     Raises:
+         FileNotFoundError: If file doesn't exist
+         ValidationError: If YAML doesn't match schema
+     """
+     with open(path) as f:
+         data = yaml.safe_load(f)
+
+     return Collection(**data)
+
+
+ def save_collection(collection: Collection, path: Path) -> None:
+     """
+     Save collection to YAML file.
+
+     Args:
+         collection: Collection object to save
+         path: Path to output YAML file
+     """
+     # Convert to dict, excluding None values for cleaner output
+     data = collection.model_dump(exclude_none=True, mode="python")
+
+     with open(path, "w") as f:
+         yaml.safe_dump(
+             data,
+             f,
+             default_flow_style=False,
+             sort_keys=False,
+             allow_unicode=True,
+         )
File without changes
@@ -0,0 +1,60 @@
+ """Unpaywall API client for open access PDF discovery."""
+
+ from __future__ import annotations
+
+ import logging
+ import time
+ from dataclasses import dataclass
+
+ import requests
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class UnpaywallResult:
+     doi: str
+     is_oa: bool
+     oa_status: str  # gold/green/bronze/hybrid/closed
+     best_oa_url: str | None  # direct PDF URL
+     license: str | None
+
+
+ class UnpaywallClient:
+     """Minimal Unpaywall REST API client with simple rate limiting."""
+
+     BASE_URL = "https://api.unpaywall.org/v2/"
+
+     def __init__(self, email: str = "site-unpaywall@oneukrainian.com") -> None:
+         self.email = email
+         self._last_request_time = 0.0
+
+     def lookup(self, doi: str) -> UnpaywallResult:
+         """Look up OA status and PDF URL for a DOI."""
+         self._rate_limit()
+         url = f"{self.BASE_URL}{doi}"
+         try:
+             resp = requests.get(url, params={"email": self.email}, timeout=30)
+             if resp.status_code == 404:
+                 return UnpaywallResult(
+                     doi=doi, is_oa=False, oa_status="closed", best_oa_url=None, license=None
+                 )
+             resp.raise_for_status()
+             data = resp.json()
+             best_loc = data.get("best_oa_location") or {}
+             return UnpaywallResult(
+                 doi=doi,
+                 is_oa=data.get("is_oa", False),
+                 oa_status=data.get("oa_status", "closed") or "closed",
+                 best_oa_url=best_loc.get("url_for_pdf") or best_loc.get("url"),
+                 license=best_loc.get("license"),
+             )
+         except requests.RequestException as e:
+             logger.warning("Unpaywall lookup failed for %s: %s", doi, e)
+             return UnpaywallResult(
+                 doi=doi, is_oa=False, oa_status="closed", best_oa_url=None, license=None
+             )
+
+     def _rate_limit(self) -> None:
+         elapsed = time.monotonic() - self._last_request_time
+         if elapsed < 0.1:
+             time.sleep(0.1 - elapsed)
+         self._last_request_time = time.monotonic()