PyPI - litsync - Versions diffs - 0.0.2__py3-none-any.whl - Mend

litsync 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

litsync/__init__.py +3 -0
litsync/__main__.py +3 -0
litsync/cli.py +143 -0
litsync/config.py +42 -0
litsync/extract.py +409 -0
litsync/http.py +127 -0
litsync/sources/__init__.py +17 -0
litsync/sources/clinicaltrials.py +28 -0
litsync/sources/fda.py +48 -0
litsync/sources/pmc.py +51 -0
litsync/sources/pubmed.py +37 -0
litsync/state.py +147 -0
litsync/sync.py +334 -0
litsync/ui.py +232 -0
litsync/utils.py +122 -0
litsync-0.0.2.dist-info/METADATA +125 -0
litsync-0.0.2.dist-info/RECORD +20 -0
litsync-0.0.2.dist-info/WHEEL +5 -0
litsync-0.0.2.dist-info/entry_points.txt +3 -0
litsync-0.0.2.dist-info/top_level.txt +1 -0

litsync/http.py ADDED Viewed

@@ -0,0 +1,127 @@
+from __future__ import annotations
+import logging
+import os
+import time
+from html.parser import HTMLParser
+from pathlib import Path
+from typing import Callable, Optional
+import requests
+from requests.adapters import HTTPAdapter
+from litsync import __version__
+from litsync.config import Config
+# Logger registered under the fixed name "litsync"; used by HttpClient below for retry warnings.
+LOG = logging.getLogger("litsync")
+# Chunk size in bytes (1 << 20 == 1 MiB) handed to iter_content() when streaming downloads.
+CHUNK = 1 << 20
+class _LinkParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.hrefs: list[str] = []
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for k, v in attrs:
+                if k == "href" and v:
+                    self.hrefs.append(v)
+class HttpClient:
+    def __init__(self, cfg: Config):
+        self.cfg = cfg
+        self.session = requests.Session()
+        adapter = HTTPAdapter(pool_connections=cfg.workers * 2, pool_maxsize=cfg.workers * 2)
+        self.session.mount("https://", adapter)
+        self.session.headers.update(
+            {
+                "User-Agent": f"litsync/{__version__} (mailto:{cfg.email}) python-requests",
+                "Accept-Encoding": "identity",
+            }
+        )
+    def _retry(self, fn, what: str):
+        last = None
+        for attempt in range(1, self.cfg.max_retries + 1):
+            try:
+                return fn()
+            except (requests.RequestException, OSError) as exc:
+                last = exc
+                wait = self.cfg.backoff_base ** attempt
+                LOG.warning("attempt %d/%d failed for %s: %s (retry in %.0fs)",
+                            attempt, self.cfg.max_retries, what, exc, wait)
+                time.sleep(wait)
+        raise last
+    def list_dir(self, url: str) -> list[str]:
+        def _do():
+            r = self.session.get(url, timeout=self.cfg.timeout)
+            r.raise_for_status()
+            return r.text
+        html = self._retry(_do, f"list {url}")
+        parser = _LinkParser()
+        parser.feed(html)
+        names = []
+        for href in parser.hrefs:
+            if href.startswith("?") or href.startswith("/") or href.startswith(".."):
+                continue
+            href = href.split("?")[0].split("#")[0]
+            if not href or href.endswith("/"):
+                continue
+            names.append(href)
+        return names
+    def get_text(self, url: str) -> str:
+        def _do():
+            r = self.session.get(url, timeout=self.cfg.timeout)
+            r.raise_for_status()
+            return r.text
+        return self._retry(_do, f"get {url}")
+    def head(self, url: str) -> tuple[Optional[int], Optional[str], Optional[str]]:
+        def _do():
+            r = self.session.head(url, timeout=self.cfg.timeout, allow_redirects=True)
+            r.raise_for_status()
+            return r
+        r = self._retry(_do, f"head {url}")
+        size = int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
+        return size, r.headers.get("Last-Modified"), r.headers.get("ETag")
+    def download(
+        self,
+        url: str,
+        dest: Path,
+        expected_size: Optional[int],
+        progress_callback: Optional[Callable[[int], None]] = None,
+    ) -> int:
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        part = dest.with_suffix(dest.suffix + ".part")
+        existing = part.stat().st_size if part.exists() else 0
+        def _do():
+            headers = {}
+            mode = "wb"
+            if existing and expected_size and existing < expected_size:
+                headers["Range"] = f"bytes={existing}-"
+                mode = "ab"
+            with self.session.get(url, stream=True, timeout=self.cfg.timeout,
+                                  headers=headers) as r:
+                if "Range" in headers and r.status_code == 200:
+                    mode = "wb"
+                r.raise_for_status()
+                with open(part, mode) as fh:
+                    for chunk in r.iter_content(CHUNK):
+                        if chunk:
+                            fh.write(chunk)
+                            if progress_callback:
+                                progress_callback(len(chunk))
+            return part.stat().st_size
+        written = self._retry(_do, f"download {url}")
+        if expected_size is not None and written != expected_size:
+            part.unlink(missing_ok=True)
+            raise IOError(f"size mismatch for {url}: got {written}, expected {expected_size}")
+        os.replace(part, dest)
+        return written

litsync/sources/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+from __future__ import annotations
+import dataclasses
+from pathlib import Path
+from typing import Optional
+# One planned download, as produced by each source's plan() method.
+@dataclasses.dataclass
+class Task:
+    # Label of the producing source, e.g. "pubmed_baseline" or "fda_{category}_{endpoint}".
+    source: str
+    # File name of the remote object; sources set it to the last component of the URL or a fixed name.
+    filename: str
+    # Remote location to fetch.
+    url: str
+    # Local target path; the sources build it as cfg.data_root / rel_path.
+    dest: Path
+    # Path of the file relative to the data root.
+    rel_path: str
+    # URL of an .md5 sidecar; only the PubMed source sets it, the others pass None.
+    md5_url: Optional[str] = None
+    # NOTE(review): presumably marks files that never change once published -- confirm in the sync code.
+    immutable: bool = True
+    # NOTE(review): presumably asks for the archive to be unpacked after download -- confirm in the sync code.
+    extract: bool = False

litsync/sources/clinicaltrials.py ADDED Viewed

@@ -0,0 +1,28 @@
+from __future__ import annotations
+from litsync.config import Config
+from litsync.http import HttpClient
+from litsync.sources import Task
+CLINICALTRIALS_XML_URL = "https://clinicaltrials.gov/api/legacy/public-xml?format=zip"
+class ClinicalTrialsSource:
+    """ClinicalTrials.gov full public XML dump."""
+    def __init__(self, cfg: Config, http: HttpClient):
+        self.cfg, self.http = cfg, http
+    def plan(self) -> list[Task]:
+        name = "ctg-public-xml.zip"
+        rel = f"clinicaltrials/{name}"
+        return [Task(
+            source="clinicaltrials_xml",
+            filename=name,
+            url=CLINICALTRIALS_XML_URL,
+            dest=self.cfg.data_root / rel,
+            rel_path=rel,
+            md5_url=None,
+            immutable=True,
+            extract=True,
+        )]

litsync/sources/fda.py ADDED Viewed

@@ -0,0 +1,48 @@
+from __future__ import annotations
+import json
+from litsync.config import Config
+from litsync.http import HttpClient
+from litsync.sources import Task
+FDA_MANIFEST_URL = "https://api.fda.gov/download.json"
+class FdaSource:
+    """openFDA bulk snapshots from the download manifest."""
+    def __init__(self, cfg: Config, http: HttpClient):
+        self.cfg, self.http = cfg, http
+    def _should_include(self, category: str, endpoint: str) -> bool:
+        if not self.cfg.fda_endpoints:
+            return True
+        return f"{category}/{endpoint}" in self.cfg.fda_endpoints
+    def plan(self) -> list[Task]:
+        text = self.http.get_text(FDA_MANIFEST_URL)
+        manifest = json.loads(text)
+        tasks: list[Task] = []
+        for category, endpoints in manifest.get("results", {}).items():
+            for endpoint, info in endpoints.items():
+                if not self._should_include(category, endpoint):
+                    continue
+                source = f"fda_{category}_{endpoint}"
+                for part in info.get("partitions", []):
+                    url = part.get("file")
+                    if not url:
+                        continue
+                    name = url.rsplit("/", 1)[-1]
+                    rel = f"fda/{category}/{endpoint}/{name}"
+                    tasks.append(Task(
+                        source=source,
+                        filename=name,
+                        url=url,
+                        dest=self.cfg.data_root / rel,
+                        rel_path=rel,
+                        md5_url=None,
+                        immutable=True,
+                        extract=True,
+                    ))
+        return tasks

litsync/sources/pmc.py ADDED Viewed

@@ -0,0 +1,51 @@
+from __future__ import annotations
+from litsync.config import Config
+from litsync.http import HttpClient
+from litsync.sources import Task
+PMC_BASE = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/deprecated"
+class PmcSource:
+    """PMC OA bulk: baseline + daily incremental .tar.gz per group/format."""
+    def __init__(self, cfg: Config, http: HttpClient):
+        self.cfg, self.http = cfg, http
+    def _plan_bulk_dir(self, group: str, fmt: str) -> list[Task]:
+        source = f"pmc_{group}_{fmt}"
+        url = f"{PMC_BASE}/oa_bulk/{group}/{fmt}/"
+        names = self.http.list_dir(url)
+        tasks = []
+        for name in sorted(names):
+            if not (name.endswith(".tar.gz") or name.endswith(".filelist.csv")
+                    or name.endswith(".filelist.txt")):
+                continue
+            rel = f"pmc/oa_bulk/{group}/{fmt}/{name}"
+            tasks.append(Task(
+                source=source,
+                filename=name,
+                url=url + name,
+                dest=self.cfg.data_root / rel,
+                rel_path=rel,
+                md5_url=None,
+                immutable=True,
+            ))
+        return tasks
+    def plan(self) -> list[Task]:
+        tasks: list[Task] = []
+        for group in self.cfg.pmc_groups:
+            for fmt in self.cfg.pmc_formats:
+                tasks.extend(self._plan_bulk_dir(group, fmt))
+        tasks.append(Task(
+            source="pmc_idmap",
+            filename="oa_file_list.csv",
+            url=f"{PMC_BASE}/oa_file_list.csv",
+            dest=self.cfg.data_root / "pmc" / "oa_file_list.csv",
+            rel_path="pmc/oa_file_list.csv",
+            md5_url=None,
+            immutable=False,
+        ))
+        return tasks

litsync/sources/pubmed.py ADDED Viewed

@@ -0,0 +1,37 @@
+from __future__ import annotations
+from litsync.config import Config
+from litsync.http import HttpClient
+from litsync.sources import Task
+PUBMED_BASE = "https://ftp.ncbi.nlm.nih.gov/pubmed"
+class PubMedSource:
+    """PubMed baseline + daily update files. Files are immutable and have .md5 sidecars."""
+    SUFFIX = ".xml.gz"
+    def __init__(self, cfg: Config, http: HttpClient):
+        self.cfg, self.http = cfg, http
+    def _plan_dir(self, subdir: str) -> list[Task]:
+        source = f"pubmed_{subdir}"
+        url = f"{PUBMED_BASE}/{subdir}/"
+        names = [n for n in self.http.list_dir(url) if n.endswith(self.SUFFIX)]
+        tasks = []
+        for name in sorted(names):
+            rel = f"pubmed/{subdir}/{name}"
+            tasks.append(Task(
+                source=source,
+                filename=name,
+                url=url + name,
+                dest=self.cfg.data_root / rel,
+                rel_path=rel,
+                md5_url=f"{url}{name}.md5",
+                immutable=True,
+            ))
+        return tasks
+    def plan(self) -> list[Task]:
+        return self._plan_dir("baseline") + self._plan_dir("updatefiles")

litsync/state.py ADDED Viewed

@@ -0,0 +1,147 @@
+from __future__ import annotations
+import contextlib
+import dataclasses
+import sqlite3
+import threading
+from pathlib import Path
+from typing import Optional
+from litsync.utils import utcnow
+# In-memory counterpart of a row in the "files" table; the field names match its columns.
+@dataclasses.dataclass
+class FileRecord:
+    # source + filename form the table's primary key.
+    source: str
+    filename: str
+    # url and rel_path are the only other fields StateDB.upsert_seen() writes.
+    url: str
+    rel_path: str
+    # Remote metadata columns; all optional.
+    remote_size: Optional[int] = None
+    remote_mtime: Optional[str] = None
+    etag: Optional[str] = None
+    # NOTE(review): presumably the published checksum vs. the locally computed one -- confirm in the sync code.
+    md5: Optional[str] = None
+    local_md5: Optional[str] = None
+    # Matches the column default 'pending'; StateDB also queries for 'verified' and 'failed'.
+    status: str = "pending"
+    attempts: int = 0
+    error: Optional[str] = None
+class StateDB:
+    """Thread-safe-enough SQLite wrapper (single connection guarded by a lock)."""
+    def __init__(self, path: Path):
+        path.parent.mkdir(parents=True, exist_ok=True)
+        self._lock = threading.Lock()
+        self._conn = sqlite3.connect(str(path), check_same_thread=False, timeout=30)
+        self._conn.row_factory = sqlite3.Row
+        with self._lock:
+            self._conn.execute("PRAGMA journal_mode=WAL")
+            self._conn.execute("PRAGMA synchronous=NORMAL")
+            self._conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS files (
+                    source        TEXT NOT NULL,
+                    filename      TEXT NOT NULL,
+                    url           TEXT NOT NULL,
+                    rel_path      TEXT NOT NULL,
+                    remote_size   INTEGER,
+                    remote_mtime  TEXT,
+                    etag          TEXT,
+                    md5           TEXT,
+                    local_md5     TEXT,
+                    status        TEXT NOT NULL DEFAULT 'pending',
+                    attempts      INTEGER NOT NULL DEFAULT 0,
+                    error         TEXT,
+                    article_count INTEGER,
+                    first_seen    TEXT NOT NULL,
+                    last_checked  TEXT NOT NULL,
+                    completed_at  TEXT,
+                    PRIMARY KEY (source, filename)
+                )
+                """
+            )
+            with contextlib.suppress(sqlite3.OperationalError):
+                self._conn.execute("ALTER TABLE files ADD COLUMN article_count INTEGER")
+            self._conn.commit()
+    def get(self, source: str, filename: str) -> Optional[sqlite3.Row]:
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT * FROM files WHERE source=? AND filename=?", (source, filename)
+            )
+            return cur.fetchone()
+    def all_sources(self) -> set[str]:
+        with self._lock:
+            cur = self._conn.execute("SELECT DISTINCT source FROM files")
+            return {r["source"] for r in cur.fetchall()}
+    def known_filenames(self, source: str) -> set[str]:
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT filename FROM files WHERE source=?", (source,)
+            )
+            return {r["filename"] for r in cur.fetchall()}
+    def upsert_seen(self, rec: FileRecord) -> None:
+        now = utcnow()
+        with self._lock:
+            self._conn.execute(
+                """
+                INSERT INTO files (source, filename, url, rel_path, first_seen, last_checked)
+                VALUES (?,?,?,?,?,?)
+                ON CONFLICT(source, filename) DO UPDATE SET
+                    url=excluded.url, rel_path=excluded.rel_path, last_checked=excluded.last_checked
+                """,
+                (rec.source, rec.filename, rec.url, rec.rel_path, now, now),
+            )
+            self._conn.commit()
+    def mark(self, source: str, filename: str, **fields) -> None:
+        if not fields:
+            return
+        fields["last_checked"] = utcnow()
+        if fields.get("status") in ("done", "verified"):
+            fields["completed_at"] = utcnow()
+        cols = ", ".join(f"{k}=?" for k in fields)
+        vals = list(fields.values()) + [source, filename]
+        with self._lock:
+            self._conn.execute(
+                f"UPDATE files SET {cols} WHERE source=? AND filename=?", vals
+            )
+            self._conn.commit()
+    def summary(self) -> dict[str, int]:
+        with self._lock:
+            cur = self._conn.execute("SELECT status, COUNT(*) c FROM files GROUP BY status")
+            return {r["status"]: r["c"] for r in cur.fetchall()}
+    def summary_by_source(self) -> dict[str, dict]:
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT source, COUNT(*) c, COALESCE(SUM(remote_size),0) bytes, "
+                "COALESCE(SUM(article_count),0) articles, "
+                "SUM(article_count IS NOT NULL) counted, "
+                "SUM(status='verified') verified, SUM(status='failed') failed "
+                "FROM files GROUP BY source"
+            )
+            return {
+                r["source"]: {
+                    "files": r["c"], "bytes": r["bytes"],
+                    "articles": r["articles"], "counted": r["counted"] or 0,
+                    "verified": r["verified"] or 0, "failed": r["failed"] or 0,
+                }
+                for r in cur.fetchall()
+            }
+    def files_missing_counts(self) -> list[tuple[str, str, str]]:
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT source, filename, rel_path FROM files "
+                "WHERE article_count IS NULL AND status='verified'"
+            )
+            return [(r["source"], r["filename"], r["rel_path"]) for r in cur.fetchall()]
+    def close(self) -> None:
+        with self._lock:
+            self._conn.close()