litsync 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litsync-0.0.2/PKG-INFO ADDED
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.4
2
+ Name: litsync
3
+ Version: 0.0.2
4
+ Summary: Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov
5
+ Author: Literature Downloader Contributors
6
+ Author-email: Rahul Brahma <rahul.brahma@uni-greifswald.de>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/takshan/litsync
9
+ Project-URL: Repository, https://github.com/takshan/litsync
10
+ Keywords: pubmed,pmc,fda,clinicaltrials,biomedical,mirror
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: requests>=2.31
21
+ Requires-Dist: rich>=13.0
22
+
23
+ # litsync — incremental PubMed + PMC + FDA + ClinicalTrials.gov mirror
24
+
25
+ A modern, daily-runnable CLI for mirroring bulk biomedical datasets. It tracks every
26
+ file in a SQLite state DB so re-runs do the minimum work: already-verified immutable
27
+ files are skipped with no network request beyond the directory/manifest listing.
28
+
29
+ ## Install
30
+
31
+ ```bash
32
+ pip install -e .
33
+ ```
34
+
35
+ Or use the Makefile:
36
+
37
+ ```bash
38
+ make install
39
+ make dev
40
+ ```
41
+
42
+ ## Quick start
43
+
44
+ ```bash
45
+ litsync --data-root /data/literature --email you@institute.org
46
+ ```
47
+
48
+ Common options:
49
+
50
+ ```bash
51
+ litsync --data-root /data/literature --email you@institute.org \
52
+ --sources pubmed pmc fda clinicaltrials \
53
+ --fda-endpoints drug/event drug/label
54
+ ```
55
+
56
+ ```bash
57
+ --sources pubmed pmc fda clinicaltrials # which corpora (default: pubmed pmc)
58
+ --fda-endpoints drug/event drug/label # default: all openFDA endpoints
59
+ --pmc-groups oa_comm oa_noncomm oa_other
60
+ --pmc-formats xml txt # default: xml
61
+ --workers 4 # concurrent downloads (keep modest; be polite)
62
+ --dry-run # plan only, download nothing
63
+ --reverify # re-download local files (integrity audit)
64
+ --prune # delete local files no longer on the server
65
+ --count-articles # count articles in already-downloaded files (no network)
66
+ --no-rich # disable Rich progress bars / tables
67
+ ```
68
+
69
+ ## On-disk layout
70
+
71
+ ```
72
+ /data/literature/
73
+ pubmed/baseline/ pubmed26nXXXX.xml.gz (+ .md5 verified)
74
+ pubmed/updatefiles/ daily citation deltas
75
+ pmc/oa_bulk/<group>/<fmt>/ baseline + dated incremental .tar.gz
76
+ pmc/oa_file_list.csv PMCID <-> PMID id map
77
+ fda/<category>/<endpoint>/ openFDA bulk snapshot zips + extracted JSON
78
+ clinicaltrials/ctg-public-xml.zip ClinicalTrials.gov full XML dump
79
+ clinicaltrials/ctg-public-xml/ extracted study XML files
80
+ _state/state.sqlite file ledger (status, size, mtime, md5, etag, attempts)
81
+ _state/logs/ dated run logs
82
+ _state/litsync.lock run lock (prevents overlapping cron runs)
83
+ ```
84
+
85
+ ## Cron (daily 02:30)
86
+
87
+ ```cron
88
+ 30 2 * * * /path/to/venv/bin/litsync --data-root /data/literature --email you@institute.org >> /data/literature/_state/cron.log 2>&1
89
+ ```
90
+
91
+ ## Extract corpus to sharded JSONL
92
+
93
+ ```bash
94
+ litsync-extract --data-root /data/literature --out /data/corpus \
95
+ --sources pubmed pmc fda clinicaltrials
96
+ ```
97
+
98
+ Or with Make:
99
+
100
+ ```bash
101
+ make extract DATA_ROOT=/data/literature CORPUS_OUT=/data/corpus
102
+ make extract-test DATA_ROOT=/data/literature
103
+ ```
104
+
105
+ ## Integrity model
106
+
107
+ - **PubMed**: every `.xml.gz` is verified against its NCBI `.md5` sidecar.
108
+ - **PMC**: bulk packages have no md5 sidecar, so they are verified by `Content-Length`
109
+ and an `ETag` is recorded for change detection.
110
+ - **openFDA / ClinicalTrials.gov**: these sources publish full snapshots. The downloader
111
+ detects changed snapshots via `ETag` / `Last-Modified` / `Content-Length` and only
112
+ re-downloads when the snapshot changes. When a snapshot changes it is extracted
113
+ again next to the zip file.
114
+ - Downloads are atomic (`.part` -> rename) and resumable via HTTP Range.
115
+ - Exit code is non-zero if any file failed, so cron/monitoring can alert.
116
+
117
+ ## Notes on sources
118
+
119
+ - **openFDA** bulk data is zipped JSON. The manifest is fetched from `https://api.fda.gov/download.json`.
120
+ Each endpoint partition becomes one downloaded/extracted unit.
121
+ - **ClinicalTrials.gov** bulk data is the full public XML dump from
122
+ `https://clinicaltrials.gov/api/legacy/public-xml?format=zip`. One XML file per study.
123
+ - Both sources are snapshots, not daily deltas. Daily runs are still cheap because unchanged
124
+ snapshots are skipped; changed snapshots are replaced in full.
125
+
@@ -0,0 +1,103 @@
1
+ # litsync — incremental PubMed + PMC + FDA + ClinicalTrials.gov mirror
2
+
3
+ A modern, daily-runnable CLI for mirroring bulk biomedical datasets. It tracks every
4
+ file in a SQLite state DB so re-runs do the minimum work: already-verified immutable
5
+ files are skipped with no network request beyond the directory/manifest listing.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install -e .
11
+ ```
12
+
13
+ Or use the Makefile:
14
+
15
+ ```bash
16
+ make install
17
+ make dev
18
+ ```
19
+
20
+ ## Quick start
21
+
22
+ ```bash
23
+ litsync --data-root /data/literature --email you@institute.org
24
+ ```
25
+
26
+ Common options:
27
+
28
+ ```bash
29
+ litsync --data-root /data/literature --email you@institute.org \
30
+ --sources pubmed pmc fda clinicaltrials \
31
+ --fda-endpoints drug/event drug/label
32
+ ```
33
+
34
+ ```bash
35
+ --sources pubmed pmc fda clinicaltrials # which corpora (default: pubmed pmc)
36
+ --fda-endpoints drug/event drug/label # default: all openFDA endpoints
37
+ --pmc-groups oa_comm oa_noncomm oa_other
38
+ --pmc-formats xml txt # default: xml
39
+ --workers 4 # concurrent downloads (keep modest; be polite)
40
+ --dry-run # plan only, download nothing
41
+ --reverify # re-download local files (integrity audit)
42
+ --prune # delete local files no longer on the server
43
+ --count-articles # count articles in already-downloaded files (no network)
44
+ --no-rich # disable Rich progress bars / tables
45
+ ```
46
+
47
+ ## On-disk layout
48
+
49
+ ```
50
+ /data/literature/
51
+ pubmed/baseline/ pubmed26nXXXX.xml.gz (+ .md5 verified)
52
+ pubmed/updatefiles/ daily citation deltas
53
+ pmc/oa_bulk/<group>/<fmt>/ baseline + dated incremental .tar.gz
54
+ pmc/oa_file_list.csv PMCID <-> PMID id map
55
+ fda/<category>/<endpoint>/ openFDA bulk snapshot zips + extracted JSON
56
+ clinicaltrials/ctg-public-xml.zip ClinicalTrials.gov full XML dump
57
+ clinicaltrials/ctg-public-xml/ extracted study XML files
58
+ _state/state.sqlite file ledger (status, size, mtime, md5, etag, attempts)
59
+ _state/logs/ dated run logs
60
+ _state/litsync.lock run lock (prevents overlapping cron runs)
61
+ ```
62
+
63
+ ## Cron (daily 02:30)
64
+
65
+ ```cron
66
+ 30 2 * * * /path/to/venv/bin/litsync --data-root /data/literature --email you@institute.org >> /data/literature/_state/cron.log 2>&1
67
+ ```
68
+
69
+ ## Extract corpus to sharded JSONL
70
+
71
+ ```bash
72
+ litsync-extract --data-root /data/literature --out /data/corpus \
73
+ --sources pubmed pmc fda clinicaltrials
74
+ ```
75
+
76
+ Or with Make:
77
+
78
+ ```bash
79
+ make extract DATA_ROOT=/data/literature CORPUS_OUT=/data/corpus
80
+ make extract-test DATA_ROOT=/data/literature
81
+ ```
82
+
83
+ ## Integrity model
84
+
85
+ - **PubMed**: every `.xml.gz` is verified against its NCBI `.md5` sidecar.
86
+ - **PMC**: bulk packages have no md5 sidecar, so they are verified by `Content-Length`
87
+ and an `ETag` is recorded for change detection.
88
+ - **openFDA / ClinicalTrials.gov**: these sources publish full snapshots. The downloader
89
+ detects changed snapshots via `ETag` / `Last-Modified` / `Content-Length` and only
90
+ re-downloads when the snapshot changes. When a snapshot changes it is extracted
91
+ again next to the zip file.
92
+ - Downloads are atomic (`.part` -> rename) and resumable via HTTP Range.
93
+ - Exit code is non-zero if any file failed, so cron/monitoring can alert.
94
+
95
+ ## Notes on sources
96
+
97
+ - **openFDA** bulk data is zipped JSON. The manifest is fetched from `https://api.fda.gov/download.json`.
98
+ Each endpoint partition becomes one downloaded/extracted unit.
99
+ - **ClinicalTrials.gov** bulk data is the full public XML dump from
100
+ `https://clinicaltrials.gov/api/legacy/public-xml?format=zip`. One XML file per study.
101
+ - Both sources are snapshots, not daily deltas. Daily runs are still cheap because unchanged
102
+ snapshots are skipped; changed snapshots are replaced in full.
103
+
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "litsync"
7
+ version = "0.0.2"
8
+ description = "Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Rahul Brahma", email = "rahul.brahma@uni-greifswald.de"},
14
+ {name = "Literature Downloader Contributors"},
15
+ ]
16
+ keywords = ["pubmed", "pmc", "fda", "clinicaltrials", "biomedical", "mirror"]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Intended Audience :: Science/Research",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ ]
26
+ dependencies = [
27
+ "requests>=2.31",
28
+ "rich>=13.0",
29
+ ]
30
+
31
+ [project.scripts]
32
+ litsync = "litsync.cli:main"
33
+ litsync-extract = "litsync.cli:extract_command"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/takshan/litsync"
37
+ Repository = "https://github.com/takshan/litsync"
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """litsync — incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov."""
2
+
3
+ __version__ = "0.0.2"
@@ -0,0 +1,3 @@
1
+ from litsync.cli import main
2
+
3
+ raise SystemExit(main())
@@ -0,0 +1,143 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from rich.logging import RichHandler
11
+
12
+ from litsync.config import Config
13
+ from litsync.extract import run_extraction
14
+ from litsync.sync import Syncer
15
+ from litsync.ui import RichUI, UI
16
+ from litsync.utils import run_lock
17
+
18
+
19
+ LOG = logging.getLogger("litsync")
20
+
21
+
22
def setup_logging(log_dir: Path, verbose: bool = False) -> None:
    """Configure root logging with a Rich console handler and a dated log file.

    Args:
        log_dir: Directory that receives ``litsync_<YYYY-MM-DD>.log``. It is
            created (including parents) when missing.
        verbose: When true the root logger level is DEBUG, otherwise INFO.
    """
    # Function-scope import: ``datetime`` is not part of this module's
    # top-level imports, and a plain import is clearer than ``__import__``.
    from datetime import date

    log_dir.mkdir(parents=True, exist_ok=True)
    logfile = log_dir / f"litsync_{date.today().isoformat()}.log"

    # Explicit UTF-8 so non-ASCII log text does not depend on the locale's
    # default encoding.
    file_handler = logging.FileHandler(logfile, encoding="utf-8")
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s %(levelname)-7s %(message)s"
    ))

    # The console handler only renders the message; Rich adds time and level.
    rich_handler = RichHandler(rich_tracebacks=True, show_path=False)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))

    # NOTE: logging.basicConfig is a no-op when the root logger already has
    # handlers, so only the first call in a process takes effect.
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        handlers=[rich_handler, file_handler],
    )
38
+
39
+
40
def parse_args(argv: Optional[list[str]] = None) -> Config:
    """Parse the ``litsync`` command line and build the run configuration.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        A ``Config`` populated from the parsed options. ``data_root`` is
        user-expanded and resolved, and ``workers`` is clamped to at least 1.

    Note:
        The parser exits with a usage error when no contact email is given
        via ``--email`` or the ``NCBI_EMAIL`` environment variable.
        ``--no-rich`` and ``--verbose`` are accepted here but are not stored
        in the returned ``Config``.
    """
    source_choices = ["pubmed", "pmc", "fda", "clinicaltrials"]
    group_choices = ["oa_comm", "oa_noncomm", "oa_other"]
    format_choices = ["xml", "txt"]

    parser = argparse.ArgumentParser(
        prog="litsync",
        description="Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov",
    )

    # Where to mirror to, and who we are.
    parser.add_argument(
        "--data-root",
        required=True,
        type=Path,
        help="root directory for the local mirror",
    )
    parser.add_argument(
        "--email",
        default=os.environ.get("NCBI_EMAIL", ""),
        help="contact email (sent in User-Agent)",
    )

    # What to mirror.
    parser.add_argument(
        "--sources",
        nargs="+",
        default=["pubmed", "pmc"],
        choices=source_choices,
    )
    parser.add_argument(
        "--pmc-groups",
        nargs="+",
        default=list(group_choices),
        choices=group_choices,
    )
    parser.add_argument(
        "--pmc-formats",
        nargs="+",
        default=["xml"],
        choices=format_choices,
    )
    parser.add_argument(
        "--fda-endpoints",
        nargs="+",
        default=None,
        help="openFDA endpoints to mirror, e.g. 'drug/event drug/label'; default: all",
    )

    # Network tuning.
    parser.add_argument("--workers", type=int, default=4)
    parser.add_argument("--max-retries", type=int, default=5)
    parser.add_argument("--timeout", type=int, default=60)

    # Run modes and output switches.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="plan only, download nothing",
    )
    parser.add_argument(
        "--reverify",
        action="store_true",
        help="re-download already-downloaded files to verify integrity",
    )
    parser.add_argument(
        "--prune",
        action="store_true",
        help="delete local files no longer present on the server",
    )
    parser.add_argument(
        "--count-articles",
        action="store_true",
        help=(
            "count articles in already-downloaded local files and exit "
            "(no network); backfills the per-source article totals"
        ),
    )
    parser.add_argument(
        "--no-rich",
        action="store_true",
        help="disable Rich progress bars and use plain text output",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="enable debug logging",
    )

    ns = parser.parse_args(argv)
    if not ns.email:
        parser.error("provide --email or set NCBI_EMAIL")

    # An absent or empty endpoint list is passed on as None.
    endpoints = None
    if ns.fda_endpoints:
        endpoints = tuple(ns.fda_endpoints)

    return Config(
        data_root=ns.data_root.expanduser().resolve(),
        email=ns.email,
        sources=tuple(ns.sources),
        pmc_groups=tuple(ns.pmc_groups),
        pmc_formats=tuple(ns.pmc_formats),
        workers=max(1, ns.workers),
        max_retries=ns.max_retries,
        timeout=ns.timeout,
        dry_run=ns.dry_run,
        reverify=ns.reverify,
        prune=ns.prune,
        count_articles=ns.count_articles,
        fda_endpoints=endpoints,
    )
91
+
92
+
93
def sync_command(cfg: Config, ui: UI) -> int:
    """Run one sync (or article-count backfill) under the run lock.

    Creates the data root, sets up logging in ``cfg.log_dir``, logs a start
    line, then builds a ``Syncer`` while holding ``run_lock(cfg.lock_path)``.

    Args:
        cfg: Run configuration.
        ui: UI object handed to the ``Syncer``.

    Returns:
        The value returned by ``Syncer.backfill_counts()`` when
        ``cfg.count_articles`` is set, otherwise by ``Syncer.run()``.
    """
    root = cfg.data_root
    root.mkdir(parents=True, exist_ok=True)
    setup_logging(cfg.log_dir, verbose=False)
    LOG.info(
        "litsync starting | root=%s sources=%s dry_run=%s count_articles=%s",
        root,
        cfg.sources,
        cfg.dry_run,
        cfg.count_articles,
    )
    with run_lock(cfg.lock_path):
        syncer = Syncer(cfg, ui)
        # Pick the entry point first, then call it inside the lock.
        action = syncer.backfill_counts if cfg.count_articles else syncer.run
        return action()
103
+
104
+
105
def extract_command(argv: Optional[list[str]] = None) -> int:
    """Entry point for ``litsync-extract``: parse options and run extraction.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always 0 once ``run_extraction`` has returned.
    """
    parser = argparse.ArgumentParser(
        prog="litsync-extract",
        description="Extract litsync mirror into sharded JSONL",
    )
    parser.add_argument("--data-root", type=Path, default=Path("./data/literature"))
    parser.add_argument("--out", type=Path, default=Path("./data/corpus"))
    parser.add_argument(
        "--sources",
        nargs="+",
        default=["pubmed", "pmc"],
        choices=["pubmed", "pmc", "fda", "clinicaltrials"],
    )
    parser.add_argument("--shard-size-mb", type=int, default=256)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--verbose", "-v", action="store_true")
    opts = parser.parse_args(argv)

    log_level = logging.INFO
    if opts.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)-7s %(message)s",
    )

    # Both paths are user-expanded and made absolute before being handed over.
    mirror_root = opts.data_root.expanduser().resolve()
    corpus_dir = opts.out.expanduser().resolve()
    run_extraction(
        mirror_root,
        corpus_dir,
        opts.sources,
        opts.shard_size_mb,
        opts.limit,
    )
    return 0
131
+
132
+
133
def main(argv: Optional[list[str]] = None) -> int:
    """Entry point for the ``litsync`` console script.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]``.

    Returns:
        The exit status produced by ``sync_command``.
    """
    # ``parse_args`` returns a Config that carries neither --no-rich nor
    # --verbose, so both flags are pre-scanned from the raw argument list.
    # Only the exact spellings are recognised (argparse prefix abbreviations
    # such as ``--verb`` are not).
    raw = sys.argv[1:] if argv is None else argv
    use_plain = "--no-rich" in raw
    want_debug = any(tok in ("-v", "--verbose") for tok in raw)

    cfg = parse_args(argv)

    if want_debug:
        # sync_command configures the root logger at INFO. Raising the
        # "litsync" logger itself to DEBUG lets its debug records reach the
        # root handlers, which have no level of their own.
        # NOTE(review): assumes the other litsync modules log under the
        # "litsync" logger hierarchy; confirm.
        LOG.setLevel(logging.DEBUG)

    ui = UI() if use_plain else RichUI()
    return sync_command(cfg, ui)
140
+
141
+
142
+ if __name__ == "__main__":
143
+ raise SystemExit(main())
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+
8
# Default number of concurrent download workers.
DEFAULT_WORKERS = 4


@dataclasses.dataclass
class Config:
    """Settings for one litsync run, plus paths derived from ``data_root``.

    The first five fields are required; the rest have defaults. All state
    paths (database, lock file, logs) live under ``<data_root>/_state``.
    """

    # Required settings.
    data_root: Path
    email: str
    sources: tuple[str, ...]
    pmc_groups: tuple[str, ...]
    pmc_formats: tuple[str, ...]

    # Network tuning.
    workers: int = DEFAULT_WORKERS
    max_retries: int = 5
    backoff_base: float = 2.0
    timeout: int = 60

    # Run-mode switches.
    dry_run: bool = False
    reverify: bool = False
    prune: bool = False
    count_articles: bool = False

    # Optional tuple of openFDA endpoints; defaults to None.
    fda_endpoints: Optional[tuple[str, ...]] = None

    @property
    def state_dir(self) -> Path:
        """Return ``<data_root>/_state``."""
        return self.data_root.joinpath("_state")

    @property
    def db_path(self) -> Path:
        """Return the path ``<state_dir>/state.sqlite``."""
        return self.state_dir.joinpath("state.sqlite")

    @property
    def lock_path(self) -> Path:
        """Return the path ``<state_dir>/litsync.lock``."""
        return self.state_dir.joinpath("litsync.lock")

    @property
    def log_dir(self) -> Path:
        """Return ``<state_dir>/logs``."""
        return self.state_dir.joinpath("logs")