litsync 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litsync/http.py ADDED
@@ -0,0 +1,127 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import time
6
+ from html.parser import HTMLParser
7
+ from pathlib import Path
8
+ from typing import Callable, Optional
9
+
10
+ import requests
11
+ from requests.adapters import HTTPAdapter
12
+
13
+ from litsync import __version__
14
+ from litsync.config import Config
15
+
16
# Logger registered under the package name "litsync".
LOG = logging.getLogger("litsync")
# Chunk size handed to ``iter_content`` when streaming downloads (1 MiB).
CHUNK = 2 ** 20
18
+
19
+
20
class _LinkParser(HTMLParser):
    """HTML parser that gathers the non-empty ``href`` of every ``<a>`` tag."""

    def __init__(self):
        super().__init__()
        # Collected hrefs, in the order the anchors were encountered.
        self.hrefs: list[str] = []

    def handle_starttag(self, tag, attrs):
        """Append each truthy ``href`` attribute value when *tag* is ``a``."""
        if tag != "a":
            return
        self.hrefs.extend(
            value for name, value in attrs if name == "href" and value
        )
30
+
31
+
32
class HttpClient:
    """HTTP helper built on one shared ``requests.Session``.

    Provides retried GET/HEAD calls, scraping of HTML directory listings and
    resumable file downloads. Retry count, back-off base, timeout, worker
    count and contact e-mail are all read from the supplied ``Config``.
    """

    def __init__(self, cfg: Config):
        """Create the session, size its HTTPS connection pool and set headers."""
        self.cfg = cfg
        self.session = requests.Session()
        pool_size = cfg.workers * 2
        adapter = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size)
        # Only https:// URLs use this adapter; other schemes keep the
        # session's default adapter.
        self.session.mount("https://", adapter)
        self.session.headers.update(
            {
                "User-Agent": f"litsync/{__version__} (mailto:{cfg.email}) python-requests",
                # NOTE(review): identity encoding presumably keeps the bytes
                # written to disk comparable with Content-Length - confirm.
                "Accept-Encoding": "identity",
            }
        )

    def _retry(self, fn, what: str):
        """Call *fn* until it succeeds or the attempt budget is exhausted.

        Args:
            fn: Zero-argument callable performing the network operation.
            what: Short description used in log messages.

        Returns:
            Whatever *fn* returns on its first successful call.

        Raises:
            requests.RequestException, OSError: The error of the final
                attempt, re-raised unchanged.
        """
        # Always make at least one attempt: with max_retries <= 0 the loop
        # used to be skipped entirely and ``raise None`` produced a TypeError.
        attempts = max(1, self.cfg.max_retries)
        for attempt in range(1, attempts + 1):
            try:
                return fn()
            except (requests.RequestException, OSError) as exc:
                if attempt == attempts:
                    # No further attempt follows, so do not sleep first.
                    LOG.warning("attempt %d/%d failed for %s: %s (giving up)",
                                attempt, attempts, what, exc)
                    raise
                wait = self.cfg.backoff_base ** attempt
                LOG.warning("attempt %d/%d failed for %s: %s (retry in %.0fs)",
                            attempt, attempts, what, exc, wait)
                time.sleep(wait)

    def _fetch_text(self, url: str, what: str) -> str:
        """GET *url* with retries and return the decoded response body."""
        def _do():
            r = self.session.get(url, timeout=self.cfg.timeout)
            r.raise_for_status()
            return r.text
        return self._retry(_do, what)

    def list_dir(self, url: str) -> list[str]:
        """Return the file names linked from the HTML directory listing at *url*.

        Skipped links: sort/query links (``?...``), absolute paths (``/...``),
        parent links (``..``), sub-directories (trailing ``/``) and absolute
        URLs (``scheme://...``), none of which name a file inside the listed
        directory. Query strings and fragments are stripped from the rest.
        """
        parser = _LinkParser()
        parser.feed(self._fetch_text(url, f"list {url}"))
        names = []
        for href in parser.hrefs:
            if href.startswith(("?", "/", "..")):
                continue
            href = href.split("?")[0].split("#")[0]
            if not href or href.endswith("/") or "://" in href:
                continue
            names.append(href)
        return names

    def get_text(self, url: str) -> str:
        """GET *url* with retries and return the response text."""
        return self._fetch_text(url, f"get {url}")

    def head(self, url: str) -> tuple[Optional[int], Optional[str], Optional[str]]:
        """HEAD *url* (following redirects) with retries.

        Returns:
            ``(size, last_modified, etag)``; each element is ``None`` when the
            corresponding response header is absent.
        """
        def _do():
            r = self.session.head(url, timeout=self.cfg.timeout, allow_redirects=True)
            r.raise_for_status()
            return r
        r = self._retry(_do, f"head {url}")
        length = r.headers.get("Content-Length")
        size = int(length) if length is not None else None
        return size, r.headers.get("Last-Modified"), r.headers.get("ETag")

    def download(
        self,
        url: str,
        dest: Path,
        expected_size: Optional[int],
        progress_callback: Optional[Callable[[int], None]] = None,
    ) -> int:
        """Stream *url* into ``<dest>.part`` and move it onto *dest* when complete.

        A partial ``.part`` file smaller than *expected_size* is resumed with
        an HTTP ``Range`` request. If the server answers 200 instead of a
        partial response, the file is rewritten from the start.

        Args:
            url: Remote file to fetch.
            dest: Final path; parent directories are created as needed.
            expected_size: Expected byte count, or ``None`` to skip the size
                check (which also disables resuming).
            progress_callback: Called with the length of every chunk written.

        Returns:
            Size in bytes of the downloaded file.

        Raises:
            IOError: The finished file size differs from *expected_size*;
                the ``.part`` file is removed first.
            requests.RequestException, OSError: The final retry failed.
        """
        dest.parent.mkdir(parents=True, exist_ok=True)
        part = dest.with_suffix(dest.suffix + ".part")

        def _do():
            # Measure the partial file on every attempt. A size captured once
            # before the first attempt goes stale as soon as a failed attempt
            # has written data, and the retry would then request the old
            # offset while appending to a longer file.
            existing = part.stat().st_size if part.exists() else 0
            headers = {}
            mode = "wb"
            if existing and expected_size and existing < expected_size:
                headers["Range"] = f"bytes={existing}-"
                mode = "ab"
            with self.session.get(url, stream=True, timeout=self.cfg.timeout,
                                  headers=headers) as r:
                if "Range" in headers and r.status_code == 200:
                    # Range was ignored and the full body is coming: start over.
                    mode = "wb"
                r.raise_for_status()
                with open(part, mode) as fh:
                    for chunk in r.iter_content(CHUNK):
                        if chunk:
                            fh.write(chunk)
                            if progress_callback:
                                progress_callback(len(chunk))
            return part.stat().st_size

        written = self._retry(_do, f"download {url}")
        if expected_size is not None and written != expected_size:
            part.unlink(missing_ok=True)
            raise IOError(f"size mismatch for {url}: got {written}, expected {expected_size}")
        os.replace(part, dest)
        return written
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+
8
@dataclasses.dataclass
class Task:
    """One planned download: a remote URL plus where the file is stored locally."""

    # Label assigned by the planner, e.g. "pubmed_baseline" or "pmc_idmap".
    source: str
    # Bare file name (last component of the remote path).
    filename: str
    # Absolute URL the file is fetched from.
    url: str
    # Local target path; the planners build it as ``cfg.data_root / rel_path``.
    dest: Path
    # Path of the file relative to the data root, using "/" separators.
    rel_path: str
    # URL of an ``.md5`` sidecar, or None when the source publishes none.
    md5_url: Optional[str] = None
    # NOTE(review): consumer not visible here - presumably "remote content
    # never changes once published"; confirm against the sync engine.
    immutable: bool = True
    # NOTE(review): presumably "unpack the archive after download"; confirm.
    extract: bool = False
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+
3
+ from litsync.config import Config
4
+ from litsync.http import HttpClient
5
+ from litsync.sources import Task
6
+
7
# Endpoint that serves the zipped public XML export.
CLINICALTRIALS_XML_URL = "https://clinicaltrials.gov/api/legacy/public-xml?format=zip"


class ClinicalTrialsSource:
    """ClinicalTrials.gov full public XML dump."""

    def __init__(self, cfg: Config, http: HttpClient):
        self.cfg = cfg
        self.http = http

    def plan(self) -> list[Task]:
        """Return the single task that downloads the zipped XML dump.

        No network request is made here; the URL and target name are fixed.
        """
        filename = "ctg-public-xml.zip"
        rel_path = f"clinicaltrials/{filename}"
        dump = Task(
            source="clinicaltrials_xml",
            filename=filename,
            url=CLINICALTRIALS_XML_URL,
            dest=self.cfg.data_root / rel_path,
            rel_path=rel_path,
            md5_url=None,
            immutable=True,
            extract=True,
        )
        return [dump]
litsync/sources/fda.py ADDED
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from litsync.config import Config
6
+ from litsync.http import HttpClient
7
+ from litsync.sources import Task
8
+
9
# JSON manifest enumerating every openFDA bulk-download partition.
FDA_MANIFEST_URL = "https://api.fda.gov/download.json"


class FdaSource:
    """openFDA bulk snapshots from the download manifest."""

    def __init__(self, cfg: Config, http: HttpClient):
        self.cfg = cfg
        self.http = http

    def _should_include(self, category: str, endpoint: str) -> bool:
        """Tell whether ``category/endpoint`` is selected by the configuration.

        An empty ``cfg.fda_endpoints`` selects everything.
        """
        selected = self.cfg.fda_endpoints
        return not selected or f"{category}/{endpoint}" in selected

    def plan(self) -> list[Task]:
        """Fetch the manifest and return one task per selected partition file.

        Partitions without a ``file`` URL are ignored.
        """
        manifest = json.loads(self.http.get_text(FDA_MANIFEST_URL))
        planned: list[Task] = []
        for category, endpoints in manifest.get("results", {}).items():
            for endpoint, info in endpoints.items():
                if not self._should_include(category, endpoint):
                    continue
                file_urls = (p.get("file") for p in info.get("partitions", []))
                # filter(None, ...) drops partitions with a missing/empty URL.
                for file_url in filter(None, file_urls):
                    filename = file_url.rsplit("/", 1)[-1]
                    rel_path = f"fda/{category}/{endpoint}/{filename}"
                    planned.append(Task(
                        source=f"fda_{category}_{endpoint}",
                        filename=filename,
                        url=file_url,
                        dest=self.cfg.data_root / rel_path,
                        rel_path=rel_path,
                        md5_url=None,
                        immutable=True,
                        extract=True,
                    ))
        return planned
litsync/sources/pmc.py ADDED
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from litsync.config import Config
4
+ from litsync.http import HttpClient
5
+ from litsync.sources import Task
6
+
7
# Root URL under which the OA bulk directories and the file list live.
PMC_BASE = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/deprecated"


class PmcSource:
    """PMC OA bulk: baseline + daily incremental .tar.gz per group/format."""

    def __init__(self, cfg: Config, http: HttpClient):
        self.cfg = cfg
        self.http = http

    def _plan_bulk_dir(self, group: str, fmt: str) -> list[Task]:
        """List one ``oa_bulk/<group>/<fmt>/`` directory and plan its files.

        Only archives (``.tar.gz``) and their ``.filelist.csv`` /
        ``.filelist.txt`` companions are kept, in sorted name order.
        """
        wanted_suffixes = (".tar.gz", ".filelist.csv", ".filelist.txt")
        source = f"pmc_{group}_{fmt}"
        listing_url = f"{PMC_BASE}/oa_bulk/{group}/{fmt}/"
        planned = []
        for filename in sorted(self.http.list_dir(listing_url)):
            if not filename.endswith(wanted_suffixes):
                continue
            rel_path = f"pmc/oa_bulk/{group}/{fmt}/{filename}"
            planned.append(Task(
                source=source,
                filename=filename,
                url=listing_url + filename,
                dest=self.cfg.data_root / rel_path,
                rel_path=rel_path,
                md5_url=None,
                immutable=True,
            ))
        return planned

    def plan(self) -> list[Task]:
        """Plan every configured group/format directory plus the OA file list.

        The ``oa_file_list.csv`` task is the only one marked mutable.
        """
        tasks: list[Task] = [
            task
            for group in self.cfg.pmc_groups
            for fmt in self.cfg.pmc_formats
            for task in self._plan_bulk_dir(group, fmt)
        ]
        id_map = Task(
            source="pmc_idmap",
            filename="oa_file_list.csv",
            url=f"{PMC_BASE}/oa_file_list.csv",
            dest=self.cfg.data_root / "pmc" / "oa_file_list.csv",
            rel_path="pmc/oa_file_list.csv",
            md5_url=None,
            immutable=False,
        )
        tasks.append(id_map)
        return tasks
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from litsync.config import Config
4
+ from litsync.http import HttpClient
5
+ from litsync.sources import Task
6
+
7
# Root URL holding the ``baseline/`` and ``updatefiles/`` directories.
PUBMED_BASE = "https://ftp.ncbi.nlm.nih.gov/pubmed"


class PubMedSource:
    """PubMed baseline + daily update files. Files are immutable and have .md5 sidecars."""

    # Only listing entries with this suffix are planned.
    SUFFIX = ".xml.gz"

    def __init__(self, cfg: Config, http: HttpClient):
        self.cfg = cfg
        self.http = http

    def _plan_dir(self, subdir: str) -> list[Task]:
        """List ``<PUBMED_BASE>/<subdir>/`` and plan each ``.xml.gz`` file.

        Tasks are returned in sorted file-name order, each pointing at its
        ``<name>.md5`` sidecar.
        """
        listing_url = f"{PUBMED_BASE}/{subdir}/"
        wanted = sorted(
            entry for entry in self.http.list_dir(listing_url)
            if entry.endswith(self.SUFFIX)
        )
        planned = []
        for filename in wanted:
            rel_path = f"pubmed/{subdir}/{filename}"
            planned.append(Task(
                source=f"pubmed_{subdir}",
                filename=filename,
                url=listing_url + filename,
                dest=self.cfg.data_root / rel_path,
                rel_path=rel_path,
                md5_url=f"{listing_url}{filename}.md5",
                immutable=True,
            ))
        return planned

    def plan(self) -> list[Task]:
        """Plan the baseline directory first, then the update files."""
        return [*self._plan_dir("baseline"), *self._plan_dir("updatefiles")]
litsync/state.py ADDED
@@ -0,0 +1,147 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import dataclasses
5
+ import sqlite3
6
+ import threading
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from litsync.utils import utcnow
11
+
12
+
13
@dataclasses.dataclass
class FileRecord:
    """In-memory description of one tracked file.

    The field names match columns of the ``files`` table created by
    ``StateDB``. ``StateDB.upsert_seen`` stores only ``source``,
    ``filename``, ``url`` and ``rel_path``.
    """

    # (source, filename) is the primary key of the ``files`` table.
    source: str
    filename: str
    url: str
    rel_path: str
    # Remote metadata; None until known.
    remote_size: Optional[int] = None
    remote_mtime: Optional[str] = None
    etag: Optional[str] = None
    # NOTE(review): presumably the published checksum vs. the locally
    # computed one - confirm against the code that fills them in.
    md5: Optional[str] = None
    local_md5: Optional[str] = None
    # Processing state; a new record starts as "pending".
    status: str = "pending"
    attempts: int = 0
    error: Optional[str] = None
27
+
28
+
29
class StateDB:
    """SQLite store of per-file sync state.

    A single connection (opened with ``check_same_thread=False``) is shared by
    all callers, and every access to it happens while holding ``self._lock``.
    """

    def __init__(self, path: Path):
        """Open (creating if needed) the database at *path* and ensure the schema."""
        path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._conn = sqlite3.connect(str(path), check_same_thread=False, timeout=30)
        self._conn.row_factory = sqlite3.Row
        with self._lock:
            for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA synchronous=NORMAL"):
                self._conn.execute(pragma)
            self._conn.execute(
                """
                CREATE TABLE IF NOT EXISTS files (
                source TEXT NOT NULL,
                filename TEXT NOT NULL,
                url TEXT NOT NULL,
                rel_path TEXT NOT NULL,
                remote_size INTEGER,
                remote_mtime TEXT,
                etag TEXT,
                md5 TEXT,
                local_md5 TEXT,
                status TEXT NOT NULL DEFAULT 'pending',
                attempts INTEGER NOT NULL DEFAULT 0,
                error TEXT,
                article_count INTEGER,
                first_seen TEXT NOT NULL,
                last_checked TEXT NOT NULL,
                completed_at TEXT,
                PRIMARY KEY (source, filename)
                )
                """
            )
            # Adds the column to a table created without it; when the column
            # already exists the statement fails and the error is ignored.
            with contextlib.suppress(sqlite3.OperationalError):
                self._conn.execute("ALTER TABLE files ADD COLUMN article_count INTEGER")
            self._conn.commit()

    def _fetchall(self, sql: str, params=()) -> list:
        """Run a read query under the lock and return all rows."""
        with self._lock:
            return self._conn.execute(sql, params).fetchall()

    def get(self, source: str, filename: str) -> Optional[sqlite3.Row]:
        """Return the row for ``(source, filename)``, or None if absent."""
        with self._lock:
            return self._conn.execute(
                "SELECT * FROM files WHERE source=? AND filename=?", (source, filename)
            ).fetchone()

    def all_sources(self) -> set[str]:
        """Return every distinct ``source`` value present in the table."""
        rows = self._fetchall("SELECT DISTINCT source FROM files")
        return {row["source"] for row in rows}

    def known_filenames(self, source: str) -> set[str]:
        """Return the file names recorded for *source*."""
        rows = self._fetchall("SELECT filename FROM files WHERE source=?", (source,))
        return {row["filename"] for row in rows}

    def upsert_seen(self, rec: FileRecord) -> None:
        """Insert *rec*, or refresh url/rel_path/last_checked of an existing row.

        ``first_seen`` is written only on first insert.
        """
        stamp = utcnow()
        params = (rec.source, rec.filename, rec.url, rec.rel_path, stamp, stamp)
        with self._lock:
            self._conn.execute(
                """
                INSERT INTO files (source, filename, url, rel_path, first_seen, last_checked)
                VALUES (?,?,?,?,?,?)
                ON CONFLICT(source, filename) DO UPDATE SET
                url=excluded.url, rel_path=excluded.rel_path, last_checked=excluded.last_checked
                """,
                params,
            )
            self._conn.commit()

    def mark(self, source: str, filename: str, **fields) -> None:
        """Update the given columns of one row; no-op when *fields* is empty.

        ``last_checked`` is always refreshed, and ``completed_at`` is stamped
        when ``status`` is set to "done" or "verified". Keyword names are
        used verbatim as column names, so they must be valid columns.
        """
        if not fields:
            return
        fields["last_checked"] = utcnow()
        if fields.get("status") in ("done", "verified"):
            fields["completed_at"] = utcnow()
        assignments = ", ".join(f"{column}=?" for column in fields)
        params = [*fields.values(), source, filename]
        with self._lock:
            self._conn.execute(
                f"UPDATE files SET {assignments} WHERE source=? AND filename=?", params
            )
            self._conn.commit()

    def summary(self) -> dict[str, int]:
        """Return a mapping of status -> number of files in that status."""
        rows = self._fetchall("SELECT status, COUNT(*) c FROM files GROUP BY status")
        return {row["status"]: row["c"] for row in rows}

    def summary_by_source(self) -> dict[str, dict]:
        """Return per-source totals.

        Each value holds ``files``, ``bytes``, ``articles``, ``counted``
        (rows with a non-NULL article_count), ``verified`` and ``failed``.
        """
        rows = self._fetchall(
            "SELECT source, COUNT(*) c, COALESCE(SUM(remote_size),0) bytes, "
            "COALESCE(SUM(article_count),0) articles, "
            "SUM(article_count IS NOT NULL) counted, "
            "SUM(status='verified') verified, SUM(status='failed') failed "
            "FROM files GROUP BY source"
        )
        report = {}
        for row in rows:
            report[row["source"]] = {
                "files": row["c"],
                "bytes": row["bytes"],
                "articles": row["articles"],
                "counted": row["counted"] or 0,
                "verified": row["verified"] or 0,
                "failed": row["failed"] or 0,
            }
        return report

    def files_missing_counts(self) -> list[tuple[str, str, str]]:
        """Return ``(source, filename, rel_path)`` of verified files lacking an article count."""
        rows = self._fetchall(
            "SELECT source, filename, rel_path FROM files "
            "WHERE article_count IS NULL AND status='verified'"
        )
        return [(row["source"], row["filename"], row["rel_path"]) for row in rows]

    def close(self) -> None:
        """Close the underlying connection."""
        with self._lock:
            self._conn.close()