litsync 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litsync-0.0.2/PKG-INFO +125 -0
- litsync-0.0.2/README.md +103 -0
- litsync-0.0.2/pyproject.toml +40 -0
- litsync-0.0.2/setup.cfg +4 -0
- litsync-0.0.2/src/litsync/__init__.py +3 -0
- litsync-0.0.2/src/litsync/__main__.py +3 -0
- litsync-0.0.2/src/litsync/cli.py +143 -0
- litsync-0.0.2/src/litsync/config.py +42 -0
- litsync-0.0.2/src/litsync/extract.py +409 -0
- litsync-0.0.2/src/litsync/http.py +127 -0
- litsync-0.0.2/src/litsync/sources/__init__.py +17 -0
- litsync-0.0.2/src/litsync/sources/clinicaltrials.py +28 -0
- litsync-0.0.2/src/litsync/sources/fda.py +48 -0
- litsync-0.0.2/src/litsync/sources/pmc.py +51 -0
- litsync-0.0.2/src/litsync/sources/pubmed.py +37 -0
- litsync-0.0.2/src/litsync/state.py +147 -0
- litsync-0.0.2/src/litsync/sync.py +334 -0
- litsync-0.0.2/src/litsync/ui.py +232 -0
- litsync-0.0.2/src/litsync/utils.py +122 -0
- litsync-0.0.2/src/litsync.egg-info/PKG-INFO +125 -0
- litsync-0.0.2/src/litsync.egg-info/SOURCES.txt +23 -0
- litsync-0.0.2/src/litsync.egg-info/dependency_links.txt +1 -0
- litsync-0.0.2/src/litsync.egg-info/entry_points.txt +3 -0
- litsync-0.0.2/src/litsync.egg-info/requires.txt +2 -0
- litsync-0.0.2/src/litsync.egg-info/top_level.txt +1 -0
litsync-0.0.2/PKG-INFO
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: litsync
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov
|
|
5
|
+
Author: Literature Downloader Contributors
|
|
6
|
+
Author-email: Rahul Brahma <rahul.brahma@uni-greifswald.de>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/takshan/litsync
|
|
9
|
+
Project-URL: Repository, https://github.com/takshan/litsync
|
|
10
|
+
Keywords: pubmed,pmc,fda,clinicaltrials,biomedical,mirror
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: requests>=2.31
|
|
21
|
+
Requires-Dist: rich>=13.0
|
|
22
|
+
|
|
23
|
+
# litsync — incremental PubMed + PMC + FDA + ClinicalTrials.gov mirror
|
|
24
|
+
|
|
25
|
+
A modern, daily-runnable CLI for mirroring bulk biomedical datasets. It tracks every
|
|
26
|
+
file in a SQLite state DB so re-runs do the minimum work: already-verified immutable
|
|
27
|
+
files are skipped with no network request beyond the directory/manifest listing.
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or use the Makefile:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
make install
|
|
39
|
+
make dev
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick start
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
litsync --data-root /data/literature --email you@institute.org
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Common options:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
litsync --data-root /data/literature --email you@institute.org \
|
|
52
|
+
--sources pubmed pmc fda clinicaltrials \
|
|
53
|
+
--fda-endpoints drug/event drug/label
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
--sources pubmed pmc fda clinicaltrials # which corpora (default: pubmed pmc)
|
|
58
|
+
--fda-endpoints drug/event drug/label # default: all openFDA endpoints
|
|
59
|
+
--pmc-groups oa_comm oa_noncomm oa_other
|
|
60
|
+
--pmc-formats xml txt # default: xml
|
|
61
|
+
--workers 4 # concurrent downloads (keep modest; be polite)
|
|
62
|
+
--dry-run # plan only, download nothing
|
|
63
|
+
--reverify # re-download local files (integrity audit)
|
|
64
|
+
--prune # delete local files no longer on the server
|
|
65
|
+
--count-articles # count articles in already-downloaded files (no network)
|
|
66
|
+
--no-rich # disable Rich progress bars / tables
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## On-disk layout
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
/data/literature/
|
|
73
|
+
pubmed/baseline/ pubmed26nXXXX.xml.gz (+ .md5 verified)
|
|
74
|
+
pubmed/updatefiles/ daily citation deltas
|
|
75
|
+
pmc/oa_bulk/<group>/<fmt>/ baseline + dated incremental .tar.gz
|
|
76
|
+
pmc/oa_file_list.csv PMCID <-> PMID id map
|
|
77
|
+
fda/<category>/<endpoint>/ openFDA bulk snapshot zips + extracted JSON
|
|
78
|
+
clinicaltrials/ctg-public-xml.zip ClinicalTrials.gov full XML dump
|
|
79
|
+
clinicaltrials/ctg-public-xml/ extracted study XML files
|
|
80
|
+
_state/state.sqlite file ledger (status, size, mtime, md5, etag, attempts)
|
|
81
|
+
_state/logs/ dated run logs
|
|
82
|
+
_state/litsync.lock run lock (prevents overlapping cron runs)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Cron (daily 02:30)
|
|
86
|
+
|
|
87
|
+
```cron
|
|
88
|
+
30 2 * * * /path/to/venv/bin/litsync --data-root /data/literature --email you@institute.org >> /data/literature/_state/cron.log 2>&1
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Extract corpus to sharded JSONL
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
litsync-extract --data-root /data/literature --out /data/corpus \
|
|
95
|
+
--sources pubmed pmc fda clinicaltrials
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Or with Make:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
make extract DATA_ROOT=/data/literature CORPUS_OUT=/data/corpus
|
|
102
|
+
make extract-test DATA_ROOT=/data/literature
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Integrity model
|
|
106
|
+
|
|
107
|
+
- **PubMed**: every `.xml.gz` is verified against its NCBI `.md5` sidecar.
|
|
108
|
+
- **PMC**: bulk packages have no md5 sidecar, so they are verified by `Content-Length`
|
|
109
|
+
and an `ETag` is recorded for change detection.
|
|
110
|
+
- **openFDA / ClinicalTrials.gov**: these sources publish full snapshots. The downloader
|
|
111
|
+
detects changed snapshots via `ETag` / `Last-Modified` / `Content-Length` and only
|
|
112
|
+
re-downloads when the snapshot changes. When a snapshot changes it is extracted
|
|
113
|
+
again next to the zip file.
|
|
114
|
+
- Downloads are atomic (`.part` -> rename) and resumable via HTTP Range.
|
|
115
|
+
- Exit code is non-zero if any file failed, so cron/monitoring can alert.
|
|
116
|
+
|
|
117
|
+
## Notes on sources
|
|
118
|
+
|
|
119
|
+
- **openFDA** bulk data is zipped JSON. The manifest is fetched from `https://api.fda.gov/download.json`.
|
|
120
|
+
Each endpoint partition becomes one downloaded/extracted unit.
|
|
121
|
+
- **ClinicalTrials.gov** bulk data is the full public XML dump from
|
|
122
|
+
`https://clinicaltrials.gov/api/legacy/public-xml?format=zip`. One XML file per study.
|
|
123
|
+
- Both sources are snapshots, not daily deltas. Daily runs are still cheap because unchanged
|
|
124
|
+
snapshots are skipped; changed snapshots are replaced in full.
|
|
125
|
+
|
litsync-0.0.2/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# litsync — incremental PubMed + PMC + FDA + ClinicalTrials.gov mirror
|
|
2
|
+
|
|
3
|
+
A modern, daily-runnable CLI for mirroring bulk biomedical datasets. It tracks every
|
|
4
|
+
file in a SQLite state DB so re-runs do the minimum work: already-verified immutable
|
|
5
|
+
files are skipped with no network request beyond the directory/manifest listing.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install -e .
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or use the Makefile:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
make install
|
|
17
|
+
make dev
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick start
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
litsync --data-root /data/literature --email you@institute.org
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Common options:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
litsync --data-root /data/literature --email you@institute.org \
|
|
30
|
+
--sources pubmed pmc fda clinicaltrials \
|
|
31
|
+
--fda-endpoints drug/event drug/label
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
--sources pubmed pmc fda clinicaltrials # which corpora (default: pubmed pmc)
|
|
36
|
+
--fda-endpoints drug/event drug/label # default: all openFDA endpoints
|
|
37
|
+
--pmc-groups oa_comm oa_noncomm oa_other
|
|
38
|
+
--pmc-formats xml txt # default: xml
|
|
39
|
+
--workers 4 # concurrent downloads (keep modest; be polite)
|
|
40
|
+
--dry-run # plan only, download nothing
|
|
41
|
+
--reverify # re-download local files (integrity audit)
|
|
42
|
+
--prune # delete local files no longer on the server
|
|
43
|
+
--count-articles # count articles in already-downloaded files (no network)
|
|
44
|
+
--no-rich # disable Rich progress bars / tables
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## On-disk layout
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
/data/literature/
|
|
51
|
+
pubmed/baseline/ pubmed26nXXXX.xml.gz (+ .md5 verified)
|
|
52
|
+
pubmed/updatefiles/ daily citation deltas
|
|
53
|
+
pmc/oa_bulk/<group>/<fmt>/ baseline + dated incremental .tar.gz
|
|
54
|
+
pmc/oa_file_list.csv PMCID <-> PMID id map
|
|
55
|
+
fda/<category>/<endpoint>/ openFDA bulk snapshot zips + extracted JSON
|
|
56
|
+
clinicaltrials/ctg-public-xml.zip ClinicalTrials.gov full XML dump
|
|
57
|
+
clinicaltrials/ctg-public-xml/ extracted study XML files
|
|
58
|
+
_state/state.sqlite file ledger (status, size, mtime, md5, etag, attempts)
|
|
59
|
+
_state/logs/ dated run logs
|
|
60
|
+
_state/litsync.lock run lock (prevents overlapping cron runs)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Cron (daily 02:30)
|
|
64
|
+
|
|
65
|
+
```cron
|
|
66
|
+
30 2 * * * /path/to/venv/bin/litsync --data-root /data/literature --email you@institute.org >> /data/literature/_state/cron.log 2>&1
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Extract corpus to sharded JSONL
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
litsync-extract --data-root /data/literature --out /data/corpus \
|
|
73
|
+
--sources pubmed pmc fda clinicaltrials
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Or with Make:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
make extract DATA_ROOT=/data/literature CORPUS_OUT=/data/corpus
|
|
80
|
+
make extract-test DATA_ROOT=/data/literature
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Integrity model
|
|
84
|
+
|
|
85
|
+
- **PubMed**: every `.xml.gz` is verified against its NCBI `.md5` sidecar.
|
|
86
|
+
- **PMC**: bulk packages have no md5 sidecar, so they are verified by `Content-Length`
|
|
87
|
+
and an `ETag` is recorded for change detection.
|
|
88
|
+
- **openFDA / ClinicalTrials.gov**: these sources publish full snapshots. The downloader
|
|
89
|
+
detects changed snapshots via `ETag` / `Last-Modified` / `Content-Length` and only
|
|
90
|
+
re-downloads when the snapshot changes. When a snapshot changes it is extracted
|
|
91
|
+
again next to the zip file.
|
|
92
|
+
- Downloads are atomic (`.part` -> rename) and resumable via HTTP Range.
|
|
93
|
+
- Exit code is non-zero if any file failed, so cron/monitoring can alert.
|
|
94
|
+
|
|
95
|
+
## Notes on sources
|
|
96
|
+
|
|
97
|
+
- **openFDA** bulk data is zipped JSON. The manifest is fetched from `https://api.fda.gov/download.json`.
|
|
98
|
+
Each endpoint partition becomes one downloaded/extracted unit.
|
|
99
|
+
- **ClinicalTrials.gov** bulk data is the full public XML dump from
|
|
100
|
+
`https://clinicaltrials.gov/api/legacy/public-xml?format=zip`. One XML file per study.
|
|
101
|
+
- Both sources are snapshots, not daily deltas. Daily runs are still cheap because unchanged
|
|
102
|
+
snapshots are skipped; changed snapshots are replaced in full.
|
|
103
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "litsync"
|
|
7
|
+
version = "0.0.2"
|
|
8
|
+
description = "Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Rahul Brahma", email = "rahul.brahma@uni-greifswald.de"},
|
|
14
|
+
{name = "Literature Downloader Contributors"},
|
|
15
|
+
]
|
|
16
|
+
keywords = ["pubmed", "pmc", "fda", "clinicaltrials", "biomedical", "mirror"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"requests>=2.31",
|
|
28
|
+
"rich>=13.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
litsync = "litsync.cli:main"
|
|
33
|
+
litsync-extract = "litsync.cli:extract_command"
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/takshan/litsync"
|
|
37
|
+
Repository = "https://github.com/takshan/litsync"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["src"]
|
litsync-0.0.2/setup.cfg
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
|
|
12
|
+
from litsync.config import Config
|
|
13
|
+
from litsync.extract import run_extraction
|
|
14
|
+
from litsync.sync import Syncer
|
|
15
|
+
from litsync.ui import RichUI, UI
|
|
16
|
+
from litsync.utils import run_lock
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
LOG = logging.getLogger("litsync")


def setup_logging(log_dir: Path, verbose: bool = False) -> None:
    """Configure root logging with a Rich console handler and a dated log file.

    Args:
        log_dir: Directory that receives the log file; created (including
            parents) when it does not exist yet.
        verbose: When true the root logger level is DEBUG, otherwise INFO.
    """
    # Function-scope import: replaces the inline ``__import__('datetime')``
    # expression without touching the module's import block.
    from datetime import date

    log_dir.mkdir(parents=True, exist_ok=True)
    # One log file per calendar day, named after today's ISO date.
    logfile = log_dir / f"litsync_{date.today().isoformat()}.log"

    # Explicit UTF-8 so non-ASCII text in messages does not depend on the
    # platform's locale encoding.
    file_handler = logging.FileHandler(logfile, encoding="utf-8")
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-7s %(message)s")
    )

    # The console handler only needs the bare message; Rich renders the
    # timestamp and level itself.
    rich_handler = RichHandler(rich_tracebacks=True, show_path=False)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))

    # NOTE: logging.basicConfig() does nothing when the root logger already
    # has handlers, so only the first call in a process takes effect.
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        handlers=[rich_handler, file_handler],
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def parse_args(argv: Optional[list[str]] = None) -> Config:
    """Parse the ``litsync`` command line into a :class:`Config`.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        A ``Config`` built from the parsed options. ``data_root`` is
        user-expanded and resolved, and ``workers`` is clamped to at least 1.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, or when no
            e-mail is given via ``--email`` or the ``NCBI_EMAIL`` variable.
    """
    ap = argparse.ArgumentParser(
        prog="litsync",
        description="Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov",
    )
    ap.add_argument("--data-root", required=True, type=Path,
                    help="root directory for the local mirror")
    ap.add_argument("--email", default=os.environ.get("NCBI_EMAIL", ""),
                    help="contact email (sent in User-Agent)")
    ap.add_argument("--sources", nargs="+", default=["pubmed", "pmc"],
                    choices=["pubmed", "pmc", "fda", "clinicaltrials"])
    ap.add_argument("--pmc-groups", nargs="+",
                    default=["oa_comm", "oa_noncomm", "oa_other"],
                    choices=["oa_comm", "oa_noncomm", "oa_other"])
    ap.add_argument("--pmc-formats", nargs="+", default=["xml"],
                    choices=["xml", "txt"])
    ap.add_argument("--fda-endpoints", nargs="+", default=None,
                    help="openFDA endpoints to mirror, e.g. 'drug/event drug/label'; default: all")
    ap.add_argument("--workers", type=int, default=4)
    ap.add_argument("--max-retries", type=int, default=5)
    ap.add_argument("--timeout", type=int, default=60)
    ap.add_argument("--dry-run", action="store_true",
                    help="plan only, download nothing")
    ap.add_argument("--reverify", action="store_true",
                    help="re-download already-downloaded files to verify integrity")
    ap.add_argument("--prune", action="store_true",
                    help="delete local files no longer present on the server")
    ap.add_argument("--count-articles", action="store_true",
                    help="count articles in already-downloaded local files and exit "
                         "(no network); backfills the per-source article totals")
    # Registered so argparse accepts the flag; main() inspects argv for it
    # to pick the UI implementation.
    ap.add_argument("--no-rich", action="store_true",
                    help="disable Rich progress bars and use plain text output")
    ap.add_argument("--verbose", "-v", action="store_true",
                    help="enable debug logging")
    a = ap.parse_args(argv)
    if not a.email:
        ap.error("provide --email or set NCBI_EMAIL")

    if a.verbose:
        # Config carries no verbosity field and sync_command() configures the
        # root logger at INFO, so the flag used to be silently ignored.
        # Lowering the package logger's own level lets DEBUG records from
        # "litsync" (and child loggers that inherit its level) reach the root
        # handlers: ancestor logger levels are not consulted on propagation.
        LOG.setLevel(logging.DEBUG)

    return Config(
        data_root=a.data_root.expanduser().resolve(),
        email=a.email,
        sources=tuple(a.sources),
        pmc_groups=tuple(a.pmc_groups),
        pmc_formats=tuple(a.pmc_formats),
        workers=max(1, a.workers),  # never allow zero or negative workers
        max_retries=a.max_retries,
        timeout=a.timeout,
        dry_run=a.dry_run,
        reverify=a.reverify,
        prune=a.prune,
        count_articles=a.count_articles,
        # An empty or absent list means "no restriction" and is passed as None.
        fda_endpoints=tuple(a.fda_endpoints) if a.fda_endpoints else None,
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def sync_command(cfg: Config, ui: UI) -> int:
    """Run one mirror pass, or an article-count backfill, under the run lock.

    Creates the data root, sets up logging in ``cfg.log_dir``, logs a start
    banner, then builds a ``Syncer`` while holding ``cfg.lock_path``.

    Args:
        cfg: Run configuration.
        ui: UI object handed to the ``Syncer``.

    Returns:
        The value returned by ``Syncer.backfill_counts()`` when
        ``cfg.count_articles`` is set, otherwise by ``Syncer.run()``.
    """
    root = cfg.data_root
    root.mkdir(parents=True, exist_ok=True)
    setup_logging(cfg.log_dir, verbose=False)
    LOG.info(
        "litsync starting | root=%s sources=%s dry_run=%s count_articles=%s",
        root,
        cfg.sources,
        cfg.dry_run,
        cfg.count_articles,
    )
    with run_lock(cfg.lock_path):
        syncer = Syncer(cfg, ui)
        # Pick the bound method first, then invoke it inside the lock.
        action = syncer.backfill_counts if cfg.count_articles else syncer.run
        return action()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def extract_command(argv: Optional[list[str]] = None) -> int:
    """Entry point for ``litsync-extract``: export the mirror as sharded JSONL.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Always ``0`` once ``run_extraction`` has returned.
    """
    source_names = ["pubmed", "pmc", "fda", "clinicaltrials"]

    parser = argparse.ArgumentParser(
        prog="litsync-extract",
        description="Extract litsync mirror into sharded JSONL",
    )
    parser.add_argument("--data-root", type=Path, default=Path("./data/literature"))
    parser.add_argument("--out", type=Path, default=Path("./data/corpus"))
    parser.add_argument(
        "--sources",
        nargs="+",
        default=["pubmed", "pmc"],
        choices=source_names,
    )
    parser.add_argument("--shard-size-mb", type=int, default=256)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--verbose", "-v", action="store_true")
    opts = parser.parse_args(argv)

    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)-7s %(message)s",
    )

    # Both locations are handed over as absolute, user-expanded paths.
    mirror_root = opts.data_root.expanduser().resolve()
    corpus_dir = opts.out.expanduser().resolve()
    run_extraction(
        mirror_root,
        corpus_dir,
        opts.sources,
        opts.shard_size_mb,
        opts.limit,
    )
    return 0
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """Entry point for the ``litsync`` command.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]``.

    Returns:
        The integer returned by ``sync_command``.
    """
    raw = sys.argv[1:] if argv is None else argv
    # parse_args() only returns a Config, which has no UI field, so the
    # --no-rich flag is detected from the raw arguments. argparse accepts any
    # unambiguous prefix of a long option (e.g. "--no-r"), so match prefixes
    # too; an exact-string test would parse the flag and still use Rich.
    use_plain = any(
        len(arg) > 2 and "--no-rich".startswith(arg) for arg in raw
    )
    cfg = parse_args(argv)
    ui = UI() if use_plain else RichUI()
    return sync_command(cfg, ui)


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import dataclasses
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
DEFAULT_WORKERS = 4


@dataclasses.dataclass
class Config:
    """Settings for one litsync run, plus paths derived from ``data_root``.

    The first five fields are required; the rest have defaults. All
    bookkeeping paths live under ``<data_root>/_state``.
    """

    # Required settings.
    data_root: Path
    email: str
    sources: tuple[str, ...]
    pmc_groups: tuple[str, ...]
    pmc_formats: tuple[str, ...]

    # Optional settings.
    workers: int = DEFAULT_WORKERS
    max_retries: int = 5
    backoff_base: float = 2.0
    timeout: int = 60
    dry_run: bool = False
    reverify: bool = False
    prune: bool = False
    count_articles: bool = False
    fda_endpoints: Optional[tuple[str, ...]] = None

    @property
    def state_dir(self) -> Path:
        """Directory ``_state`` directly below the data root."""
        return self.data_root.joinpath("_state")

    @property
    def db_path(self) -> Path:
        """Path of ``state.sqlite`` inside the state directory."""
        return self.state_dir.joinpath("state.sqlite")

    @property
    def lock_path(self) -> Path:
        """Path of ``litsync.lock`` inside the state directory."""
        return self.state_dir.joinpath("litsync.lock")

    @property
    def log_dir(self) -> Path:
        """Directory ``logs`` inside the state directory."""
        return self.state_dir.joinpath("logs")
|