litsync 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litsync/__init__.py +3 -0
- litsync/__main__.py +3 -0
- litsync/cli.py +143 -0
- litsync/config.py +42 -0
- litsync/extract.py +409 -0
- litsync/http.py +127 -0
- litsync/sources/__init__.py +17 -0
- litsync/sources/clinicaltrials.py +28 -0
- litsync/sources/fda.py +48 -0
- litsync/sources/pmc.py +51 -0
- litsync/sources/pubmed.py +37 -0
- litsync/state.py +147 -0
- litsync/sync.py +334 -0
- litsync/ui.py +232 -0
- litsync/utils.py +122 -0
- litsync-0.0.2.dist-info/METADATA +125 -0
- litsync-0.0.2.dist-info/RECORD +20 -0
- litsync-0.0.2.dist-info/WHEEL +5 -0
- litsync-0.0.2.dist-info/entry_points.txt +3 -0
- litsync-0.0.2.dist-info/top_level.txt +1 -0
litsync/__init__.py
ADDED
litsync/__main__.py
ADDED
litsync/cli.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
|
|
12
|
+
from litsync.config import Config
|
|
13
|
+
from litsync.extract import run_extraction
|
|
14
|
+
from litsync.sync import Syncer
|
|
15
|
+
from litsync.ui import RichUI, UI
|
|
16
|
+
from litsync.utils import run_lock
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
LOG = logging.getLogger("litsync")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def setup_logging(log_dir: Path, verbose: bool = False) -> None:
    """Attach a Rich console handler and a dated file handler to the root logger.

    Args:
        log_dir: Directory that receives the log file; it is created (with
            parents) when it does not exist yet.
        verbose: Root level is ``DEBUG`` when true, ``INFO`` otherwise.

    The log file is named ``litsync_<today's ISO date>.log`` inside ``log_dir``.
    """
    # Function-scope import: this module does not import datetime at file level.
    from datetime import date

    log_dir.mkdir(parents=True, exist_ok=True)
    logfile = log_dir / f"litsync_{date.today().isoformat()}.log"

    # Explicit UTF-8 so the file is written the same way whatever the
    # platform's default text encoding is.
    file_handler = logging.FileHandler(logfile, encoding="utf-8")
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s %(levelname)-7s %(message)s"
    ))

    rich_handler = RichHandler(rich_tracebacks=True, show_path=False)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        handlers=[rich_handler, file_handler],
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def parse_args(argv: Optional[list[str]] = None) -> Config:
    """Parse the ``litsync`` command line and build the run configuration.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        A ``Config`` populated from the parsed options. ``data_root`` is
        user-expanded and resolved, and ``workers`` is clamped to at least 1.

    The parser exits with a usage error when neither ``--email`` nor the
    ``NCBI_EMAIL`` environment variable supplies a contact address.
    ``--no-rich`` and ``--verbose`` are accepted here but are not copied into
    the returned ``Config``.
    """
    parser = argparse.ArgumentParser(
        prog="litsync",
        description="Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov",
    )
    add = parser.add_argument

    # What to mirror and where.
    add("--data-root", required=True, type=Path,
        help="root directory for the local mirror")
    add("--email", default=os.environ.get("NCBI_EMAIL", ""),
        help="contact email (sent in User-Agent)")
    add("--sources", nargs="+", default=["pubmed", "pmc"],
        choices=["pubmed", "pmc", "fda", "clinicaltrials"])
    add("--pmc-groups", nargs="+", default=["oa_comm", "oa_noncomm", "oa_other"],
        choices=["oa_comm", "oa_noncomm", "oa_other"])
    add("--pmc-formats", nargs="+", default=["xml"],
        choices=["xml", "txt"])
    add("--fda-endpoints", nargs="+", default=None,
        help="openFDA endpoints to mirror, e.g. 'drug/event drug/label'; default: all")

    # Transfer tuning.
    add("--workers", type=int, default=4)
    add("--max-retries", type=int, default=5)
    add("--timeout", type=int, default=60)

    # Mode switches.
    add("--dry-run", action="store_true",
        help="plan only, download nothing")
    add("--reverify", action="store_true",
        help="re-download already-downloaded files to verify integrity")
    add("--prune", action="store_true",
        help="delete local files no longer present on the server")
    add("--count-articles", action="store_true",
        help="count articles in already-downloaded local files and exit "
             "(no network); backfills the per-source article totals")
    add("--no-rich", action="store_true",
        help="disable Rich progress bars and use plain text output")
    add("--verbose", "-v", action="store_true",
        help="enable debug logging")

    ns = parser.parse_args(argv)
    if not ns.email:
        parser.error("provide --email or set NCBI_EMAIL")

    endpoints = tuple(ns.fda_endpoints) if ns.fda_endpoints else None
    root = ns.data_root.expanduser().resolve()
    worker_count = max(1, ns.workers)

    return Config(
        data_root=root,
        email=ns.email,
        sources=tuple(ns.sources),
        pmc_groups=tuple(ns.pmc_groups),
        pmc_formats=tuple(ns.pmc_formats),
        workers=worker_count,
        max_retries=ns.max_retries,
        timeout=ns.timeout,
        dry_run=ns.dry_run,
        reverify=ns.reverify,
        prune=ns.prune,
        count_articles=ns.count_articles,
        fda_endpoints=endpoints,
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def sync_command(cfg: Config, ui: UI) -> int:
    """Run a sync, or the article-count backfill, while holding the run lock.

    Args:
        cfg: Run configuration; supplies the data root, log directory, lock
            path and the ``count_articles`` switch.
        ui: UI object handed to the ``Syncer``.

    Returns:
        Whatever ``Syncer.backfill_counts()`` (when ``cfg.count_articles`` is
        set) or ``Syncer.run()`` returns.
    """
    cfg.data_root.mkdir(parents=True, exist_ok=True)

    # NOTE(review): verbose is hard-coded to False, so file/console logging
    # always runs at INFO from this entry point.
    setup_logging(cfg.log_dir, verbose=False)
    LOG.info(
        "litsync starting | root=%s sources=%s dry_run=%s count_articles=%s",
        cfg.data_root,
        cfg.sources,
        cfg.dry_run,
        cfg.count_articles,
    )

    with run_lock(cfg.lock_path):
        syncer = Syncer(cfg, ui)
        action = syncer.backfill_counts if cfg.count_articles else syncer.run
        return action()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def extract_command(argv: Optional[list[str]] = None) -> int:
    """Entry point for ``litsync-extract``: turn a mirror into sharded JSONL.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always ``0`` once ``run_extraction`` has returned.
    """
    parser = argparse.ArgumentParser(
        prog="litsync-extract",
        description="Extract litsync mirror into sharded JSONL",
    )
    parser.add_argument("--data-root", type=Path, default=Path("./data/literature"))
    parser.add_argument("--out", type=Path, default=Path("./data/corpus"))
    parser.add_argument("--sources", nargs="+", default=["pubmed", "pmc"],
                        choices=["pubmed", "pmc", "fda", "clinicaltrials"])
    parser.add_argument("--shard-size-mb", type=int, default=256)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--verbose", "-v", action="store_true")
    opts = parser.parse_args(argv)

    if opts.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)-7s %(message)s",
    )

    # Both paths are made absolute before being handed to the extractor.
    mirror_root = opts.data_root.expanduser().resolve()
    corpus_dir = opts.out.expanduser().resolve()
    run_extraction(mirror_root, corpus_dir, opts.sources, opts.shard_size_mb, opts.limit)
    return 0
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """Entry point for ``litsync``: parse arguments, pick a UI, run the sync.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]``.

    Returns:
        The value returned by ``sync_command``.
    """
    # The UI choice is read straight from the raw argument list: an exact
    # "--no-rich" token selects the plain UI.
    tokens = sys.argv[1:] if argv is None else argv
    plain_output = "--no-rich" in tokens

    cfg = parse_args(argv)
    if plain_output:
        ui = UI()
    else:
        ui = RichUI()
    return sync_command(cfg, ui)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|
litsync/config.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import dataclasses
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
DEFAULT_WORKERS = 4


@dataclasses.dataclass
class Config:
    """Settings for one litsync run, plus paths derived from ``data_root``.

    All bookkeeping locations (database, lock file, logs) live under the
    ``_state`` directory inside ``data_root``.
    """

    # Required settings.
    data_root: Path
    email: str
    sources: tuple[str, ...]
    pmc_groups: tuple[str, ...]
    pmc_formats: tuple[str, ...]

    # Transfer tuning.
    workers: int = DEFAULT_WORKERS
    max_retries: int = 5
    backoff_base: float = 2.0
    timeout: int = 60

    # Mode switches.
    dry_run: bool = False
    reverify: bool = False
    prune: bool = False
    count_articles: bool = False

    # None means no explicit endpoint selection was given.
    fda_endpoints: Optional[tuple[str, ...]] = None

    @property
    def state_dir(self) -> Path:
        """Directory ``<data_root>/_state``."""
        return self.data_root.joinpath("_state")

    @property
    def db_path(self) -> Path:
        """File ``<state_dir>/state.sqlite``."""
        return self.state_dir.joinpath("state.sqlite")

    @property
    def lock_path(self) -> Path:
        """File ``<state_dir>/litsync.lock``."""
        return self.state_dir.joinpath("litsync.lock")

    @property
    def log_dir(self) -> Path:
        """Directory ``<state_dir>/logs``."""
        return self.state_dir.joinpath("logs")
|
litsync/extract.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import io
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import re
|
|
8
|
+
import tarfile
|
|
9
|
+
import time
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Iterator, Optional
|
|
12
|
+
from xml.etree import ElementTree as ET
|
|
13
|
+
|
|
14
|
+
LOG = logging.getLogger("litsync")
|
|
15
|
+
_WS = re.compile(r"\s+")


def clean(s: Optional[str]) -> str:
    """Collapse each whitespace run to one space and trim both ends.

    ``None`` and the empty string both give ``""``.
    """
    if not s:
        return ""
    collapsed = _WS.sub(" ", s)
    return collapsed.strip()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def local_tag(tag: str) -> str:
    """Return the part of *tag* after its last ``}``, or *tag* itself if none.

    This strips the ``{namespace}`` prefix ElementTree puts on qualified tags.
    """
    _, _, local = tag.rpartition("}")
    return local
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def text(el: Optional[ET.Element]) -> str:
    """Return the whitespace-normalised text of *el* and all its descendants.

    Text fragments are joined with single spaces before cleaning; a missing
    element (``None``) gives ``""``.
    """
    if el is not None:
        fragments = el.itertext()
        return clean(" ".join(fragments))
    return ""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def find_local(parent: ET.Element, name: str) -> Optional[ET.Element]:
    """Return the first element whose namespace-stripped tag equals *name*.

    The search covers *parent* itself and every descendant in document order;
    ``None`` is returned when nothing matches.
    """
    matches = (node for node in parent.iter() if local_tag(node.tag) == name)
    return next(matches, None)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def findall_local(parent: ET.Element, name: str) -> list[ET.Element]:
    """Return every element whose namespace-stripped tag equals *name*.

    The search covers *parent* itself and every descendant; results are in
    document order.
    """
    found = []
    for node in parent.iter():
        if local_tag(node.tag) == name:
            found.append(node)
    return found
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# --------------------------------------------------------------------------- #
|
|
44
|
+
# PubMed
|
|
45
|
+
# --------------------------------------------------------------------------- #
|
|
46
|
+
def pubmed_year(article: ET.Element) -> Optional[int]:
    """Return the publication year from the first usable ``PubDate``.

    For each ``PubDate`` in document order, an all-digit ``Year`` child wins;
    otherwise the first four-digit run inside ``MedlineDate`` is used.
    ``None`` is returned when no ``PubDate`` yields a year.
    """
    for pub_date in findall_local(article, "PubDate"):
        year_text = (pub_date.findtext("Year") or "").strip()
        if year_text.isdigit():
            return int(year_text)

        medline_date = pub_date.findtext("MedlineDate")
        if medline_date:
            four_digits = re.search(r"\d{4}", medline_date)
            if four_digits:
                return int(four_digits.group())
    return None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _pubmed_own_article_ids(article: ET.Element) -> Iterator[ET.Element]:
    """Yield the ``ArticleId`` elements at ``PubmedData/ArticleIdList`` of *article*."""
    for data in article:
        if local_tag(data.tag) != "PubmedData":
            continue
        for id_list in data:
            if local_tag(id_list.tag) != "ArticleIdList":
                continue
            for aid in id_list:
                if local_tag(aid.tag) == "ArticleId":
                    yield aid


def parse_pubmed(raw: bytes, source_file: str) -> Iterator[dict]:
    """Yield one record dict per ``PubmedArticle`` in a PubMed XML document.

    Args:
        raw: The (already decompressed) XML bytes.
        source_file: Stored verbatim in each record's ``source_file`` field.

    Yields:
        Dicts with the keys ``source``, ``pmid``, ``pmcid``, ``doi``,
        ``title``, ``abstract``, ``body``, ``journal``, ``year``, ``authors``,
        ``mesh``, ``keywords`` and ``source_file``. ``pmcid`` is always
        ``None`` and ``body`` is always empty for this source.

    Raises:
        xml.etree.ElementTree.ParseError: If *raw* is not well-formed XML.
    """
    for _, elem in ET.iterparse(io.BytesIO(raw), events=("end",)):
        if local_tag(elem.tag) != "PubmedArticle":
            continue
        try:
            pmid_el = find_local(elem, "PMID")
            title = text(find_local(elem, "ArticleTitle"))
            abstract = " ".join(text(a) for a in findall_local(elem, "AbstractText")).strip()

            journal = ""
            jt = find_local(elem, "Journal")
            if jt is not None:
                journal = text(jt.find("Title"))

            authors = []
            for au in findall_local(elem, "Author"):
                last = au.findtext("LastName") or ""
                init = au.findtext("Initials") or ""
                coll = au.findtext("CollectiveName") or ""
                # Personal name when present, otherwise the collective name.
                name = clean(f"{last} {init}".strip() or coll)
                if name:
                    authors.append(name)

            mesh = [text(d) for d in findall_local(elem, "DescriptorName") if text(d)]
            keywords = [text(k) for k in findall_local(elem, "Keyword") if text(k)]

            # Only the article's own id list is consulted. A whole-subtree
            # search would also reach ArticleId elements nested in the
            # reference list and could report a cited work's DOI for an
            # article that has none of its own.
            doi = None
            for aid in _pubmed_own_article_ids(elem):
                if aid.get("IdType") == "doi" and aid.text:
                    doi = aid.text.strip()
                    break

            yield {
                "source": "pubmed",
                "pmid": pmid_el.text.strip() if pmid_el is not None and pmid_el.text else None,
                "pmcid": None,
                "doi": doi,
                "title": title,
                "abstract": clean(abstract),
                "body": "",
                "journal": journal,
                "year": pubmed_year(elem),
                "authors": authors,
                "mesh": mesh,
                "keywords": keywords,
                "source_file": source_file,
            }
        finally:
            # Drop the article's subtree once its record has been consumed.
            elem.clear()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# --------------------------------------------------------------------------- #
|
|
106
|
+
# PMC
|
|
107
|
+
# --------------------------------------------------------------------------- #
|
|
108
|
+
def parse_pmc_article(raw: bytes, source_file: str) -> Optional[dict]:
    """Parse one PMC article XML document into a record dict.

    Args:
        raw: XML bytes of a single article.
        source_file: Stored verbatim in the record's ``source_file`` field.

    Returns:
        A dict with the keys ``source``, ``pmid``, ``pmcid``, ``doi``,
        ``title``, ``abstract``, ``body``, ``journal``, ``year``, ``authors``,
        ``mesh`` (always empty), ``keywords`` and ``source_file``; or ``None``
        when *raw* is not well-formed XML.
    """
    try:
        root = ET.fromstring(raw)
    except ET.ParseError:
        return None

    ids = {}
    for aid in findall_local(root, "article-id"):
        t = aid.get("pub-id-type") or aid.get("{http://www.w3.org/1999/xlink}type")
        if t and aid.text:
            # First occurrence wins: ids that appear later in the document
            # (e.g. in a nested sub-article or response) must not replace the
            # ones from the article's own front matter.
            ids.setdefault(t, aid.text.strip())

    pmcid = ids.get("pmc") or ids.get("pmcid")
    if pmcid and not pmcid.upper().startswith("PMC"):
        pmcid = "PMC" + pmcid

    title = text(find_local(root, "article-title"))
    abstract = " ".join(text(a) for a in findall_local(root, "abstract")).strip()
    journal = text(find_local(root, "journal-title"))

    # Year of the first pub-date that carries an all-digit <year> child.
    year = None
    for pub_date in findall_local(root, "pub-date"):
        y = pub_date.find("year")
        if y is not None and y.text and y.text.strip().isdigit():
            year = int(y.text.strip())
            break

    authors = []
    for contrib in findall_local(root, "contrib"):
        # Contributors with no contrib-type are treated as authors too.
        if contrib.get("contrib-type") not in (None, "author"):
            continue
        name = find_local(contrib, "name")
        if name is not None:
            sur = name.findtext("surname") or ""
            giv = name.findtext("given-names") or ""
            full = clean(f"{sur} {giv}".strip())
            if full:
                authors.append(full)

    body = text(find_local(root, "body"))

    return {
        "source": "pmc",
        "pmid": ids.get("pmid"),
        "pmcid": pmcid,
        "doi": ids.get("doi"),
        "title": title,
        "abstract": clean(abstract),
        "body": body,
        "journal": journal,
        "year": year,
        "authors": authors,
        "mesh": [],
        "keywords": [text(k) for k in findall_local(root, "kwd") if text(k)],
        "source_file": source_file,
    }
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def parse_pmc_tar(path: Path, rel: str) -> Iterator[dict]:
    """Yield a record for every parseable ``.xml``/``.nxml`` file in a PMC tarball.

    Args:
        path: Gzip-compressed tar archive, read as a forward-only stream.
        rel: Label for the archive; each record's ``source_file`` becomes
            ``"<rel>::<member name>"``.

    Members that are not regular files, have another extension, cannot be
    extracted, or fail to parse are skipped.
    """
    with tarfile.open(path, mode="r|gz") as archive:
        for entry in archive:
            lowered = entry.name.lower()
            if not entry.isfile() or not lowered.endswith((".xml", ".nxml")):
                continue
            handle = archive.extractfile(entry)
            if handle is None:
                continue
            # Read right away: a streamed archive cannot seek back later.
            record = parse_pmc_article(handle.read(), f"{rel}::{entry.name}")
            if record is None:
                continue
            yield record
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# --------------------------------------------------------------------------- #
|
|
184
|
+
# openFDA
|
|
185
|
+
# --------------------------------------------------------------------------- #
|
|
186
|
+
def _flatten_fda_value(v):
|
|
187
|
+
if isinstance(v, str):
|
|
188
|
+
return v
|
|
189
|
+
if isinstance(v, (list, tuple)):
|
|
190
|
+
return " ".join(_flatten_fda_value(x) for x in v)
|
|
191
|
+
if isinstance(v, dict):
|
|
192
|
+
return " ".join(_flatten_fda_value(x) for x in v.values())
|
|
193
|
+
return ""
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def parse_fda_json_file(path: Path, rel: str) -> Iterator[dict]:
    """Yield one record dict per entry of an openFDA JSON file's ``results``.

    Args:
        path: UTF-8 JSON file; the whole file is loaded into memory.
        rel: Path label for the file. Its second and third components form
            the record's ``fda_endpoint`` (empty when *rel* has fewer than
            three components); it is also stored as ``source_file``.

    Yields:
        Dicts whose ``body`` is the flattened, whitespace-normalised text of
        the entry. ``id`` is the entry's ``safetyreportid``, ``set_id`` or
        ``id``, else its first string value, else ``None``. ``last_updated``
        is copied from the file's ``meta`` object.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    # "or" fallbacks also cover keys that are present but null.
    results = data.get("results") or []
    meta = data.get("meta") or {}
    last_updated = meta.get("last_updated")

    parts = Path(rel).parts
    endpoint = "/".join(parts[1:3]) if len(parts) >= 3 else ""

    for rec in results:
        body = clean(_flatten_fda_value(rec))

        # Only dict entries can carry an id; anything else is still emitted
        # (with id None) instead of raising on a missing .get().
        rid = None
        if isinstance(rec, dict):
            rid = rec.get("safetyreportid") or rec.get("set_id") or rec.get("id")
            if not rid:
                rid = next((v for v in rec.values() if isinstance(v, str)), None)

        yield {
            "source": "fda",
            "fda_endpoint": endpoint,
            "id": str(rid) if rid else None,
            "title": "",
            "abstract": "",
            "body": body,
            "journal": "",
            "year": None,
            "authors": [],
            "mesh": [],
            "keywords": [],
            "source_file": rel,
            "last_updated": last_updated,
        }
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# --------------------------------------------------------------------------- #
|
|
229
|
+
# ClinicalTrials
|
|
230
|
+
# --------------------------------------------------------------------------- #
|
|
231
|
+
def parse_clinicaltrials_xml_file(path: Path, rel: str) -> Optional[dict]:
    """Parse one ClinicalTrials XML file into a record dict.

    Args:
        path: XML file describing a single study.
        rel: Stored verbatim in the record's ``source_file`` field.

    Returns:
        The record dict, or ``None`` when the file is not well-formed XML.
        ``body`` concatenates the brief summary, detailed description,
        eligibility text and, when present, ``Conditions:`` /
        ``Interventions:`` lists. ``year`` is the first four-digit run found
        in the first non-empty date field that contains one.
    """
    try:
        root = ET.fromstring(path.read_bytes())
    except ET.ParseError:
        return None

    def first_text(tag: str) -> str:
        # Normalised text of the first element named *tag*, "" if absent.
        return text(find_local(root, tag))

    def all_texts(tag: str) -> list[str]:
        # Normalised text of every element named *tag*.
        return [text(node) for node in findall_local(root, tag)]

    nct_id = first_text("nct_id") or first_text("nctId")
    title = first_text("official_title") or first_text("brief_title")
    brief_summary = first_text("brief_summary")
    detailed_description = first_text("detailed_description")
    eligibility = first_text("eligibility")
    conditions = all_texts("condition")
    interventions = all_texts("intervention")
    phases = all_texts("phase")
    statuses = all_texts("overall_status")

    year = None
    date_tags = ("start_date", "completion_date", "verification_date", "study_first_submitted")
    for date_tag in date_tags:
        found = re.search(r"\d{4}", first_text(date_tag))
        if found:
            year = int(found.group())
            break

    sections = [brief_summary, detailed_description, eligibility]
    if conditions:
        sections.append("Conditions: " + ", ".join(conditions))
    if interventions:
        sections.append("Interventions: " + ", ".join(interventions))
    body = "\n\n".join(part for part in sections if part)

    record = {
        "source": "clinicaltrials",
        "nct_id": nct_id,
        "pmid": None,
        "pmcid": None,
        "doi": None,
        "title": title,
        "abstract": clean(brief_summary),
        "body": clean(body),
        "journal": "",
        "year": year,
        "authors": [],
        "mesh": [],
        "keywords": conditions + interventions,
        "source_file": rel,
        "phase": phases[0] if phases else None,
        "overall_status": statuses[0] if statuses else None,
    }
    return record
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# --------------------------------------------------------------------------- #
|
|
291
|
+
# Sharded JSONL writer
|
|
292
|
+
# --------------------------------------------------------------------------- #
|
|
293
|
+
class ShardWriter:
    """Write records as JSON lines into numbered, size-capped shard files.

    Shards are named ``<prefix>-<NNNNN>.jsonl`` inside ``out_dir`` (which must
    already exist), numbered from 1. The first shard is opened on
    construction. A new shard is started when the next record would push a
    non-empty shard past ``shard_bytes``; a single record larger than the cap
    still goes into one shard by itself.

    The writer can be used as a context manager; leaving the ``with`` block
    closes the current shard.
    """

    def __init__(self, out_dir: Path, shard_bytes: int, prefix: str = "corpus"):
        self.out_dir = out_dir
        self.shard_bytes = shard_bytes
        self.prefix = prefix
        self.idx = 0        # number of the shard currently open
        self.bytes = 0      # UTF-8 bytes written to the current shard
        self.records = 0    # records written across all shards
        self.fh = None
        self._roll()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def _roll(self):
        """Close the current shard, if any, and open the next one."""
        if self.fh:
            self.fh.close()
        self.idx += 1
        self.bytes = 0
        name = f"{self.prefix}-{self.idx:05d}.jsonl"
        # newline="\n" turns off newline translation, so the bytes on disk
        # match the encoded length that write() accounts for.
        self.fh = open(self.out_dir / name, "w", encoding="utf-8", newline="\n")
        LOG.info("writing shard %s", name)

    def write(self, rec: dict):
        """Append *rec* as one JSON line, rolling to a new shard when needed."""
        line = json.dumps(rec, ensure_ascii=False) + "\n"
        data = line.encode("utf-8")
        # Never roll an empty shard: an oversized record must land somewhere.
        if self.bytes and self.bytes + len(data) > self.shard_bytes:
            self._roll()
        self.fh.write(line)
        self.bytes += len(data)
        self.records += 1

    def close(self):
        """Close the current shard file."""
        if self.fh:
            self.fh.close()
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def iter_source_files(data_root: Path, sources: list[str], limit: Optional[int]):
    """Yield ``(kind, path)`` for the mirrored files of each requested source.

    Sources are visited in the fixed order pubmed, pmc, fda, clinicaltrials,
    whatever the order of *sources*. Within a source, paths are sorted and at
    most *limit* of them are yielded; a falsy *limit* (``None`` or ``0``)
    means no cap. The cap applies to each source separately.
    """
    patterns = (
        ("pubmed", "pubmed/**/*.xml.gz"),
        ("pmc", "pmc/**/*.tar.gz"),
        ("fda", "fda/**/*/*.json"),
        ("clinicaltrials", "clinicaltrials/**/*.xml"),
    )
    for kind, pattern in patterns:
        if kind not in sources:
            continue
        matched = sorted(data_root.glob(pattern))
        for path in matched[: limit or None]:
            yield kind, path
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _iter_file_records(kind: str, path: Path, rel: str) -> Iterator[dict]:
    """Return an iterator over the records parsed from one mirrored file."""
    if kind == "pubmed":
        return parse_pubmed(gzip.decompress(path.read_bytes()), rel)
    if kind == "pmc":
        return parse_pmc_tar(path, rel)
    if kind == "fda":
        return parse_fda_json_file(path, rel)
    if kind == "clinicaltrials":
        rec = parse_clinicaltrials_xml_file(path, rel)
        return iter([rec] if rec else [])
    raise ValueError(f"unknown source kind: {kind!r}")


def run_extraction(data_root: Path, out_dir: Path, sources: list[str],
                   shard_size_mb: int, limit: Optional[int]) -> dict:
    """Extract the requested sources of a mirror into sharded JSONL.

    Args:
        data_root: Root of the local mirror.
        out_dir: Output directory for shards and ``manifest.json``; created
            (with parents) if missing.
        sources: Source names to extract.
        shard_size_mb: Shard size cap in MiB.
        limit: Per-source cap on the number of input files (falsy = no cap).

    Returns:
        The manifest dict, which is also written to ``out_dir/manifest.json``:
        per-source file and record counts, ``errors``, ``total_records``,
        ``shards``, ``shard_size_mb``, ``out_dir`` and ``elapsed_sec``.

    A failure while processing one file is logged and counted in ``errors``;
    extraction continues with the next file. Records already written from a
    failed file stay in the shards and in ``total_records`` but are not added
    to that source's per-source counters.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    writer = ShardWriter(out_dir, shard_size_mb * 1024 * 1024)

    stats = {"pubmed_files": 0, "pmc_files": 0, "fda_files": 0, "clinicaltrials_files": 0,
             "pubmed_records": 0, "pmc_records": 0, "fda_records": 0,
             "clinicaltrials_records": 0, "errors": 0}
    t0 = time.time()

    try:
        for kind, path in iter_source_files(data_root, sources, limit):
            rel = str(path.relative_to(data_root))
            LOG.info("processing %s (%s)", rel, kind)
            try:
                n = 0
                for rec in _iter_file_records(kind, path, rel):
                    writer.write(rec)
                    n += 1
                    # Progress heartbeat, emitted for PMC archives only.
                    if kind == "pmc" and n % 5000 == 0:
                        LOG.info(" ... %d articles from %s", n, rel)
                stats[f"{kind}_records"] += n
                stats[f"{kind}_files"] += 1
                LOG.info(" -> %d records (running total %d)", n, writer.records)
            except Exception as exc:
                # Per-file boundary: one bad input must not abort the run.
                stats["errors"] += 1
                LOG.error("failed on %s: %s", rel, exc)
    finally:
        # Close the open shard even when something other than a per-file
        # Exception (e.g. KeyboardInterrupt) ends the loop.
        writer.close()
    elapsed = time.time() - t0

    manifest = {
        **stats,
        "total_records": writer.records,
        "shards": writer.idx,
        "shard_size_mb": shard_size_mb,
        "out_dir": str(out_dir),
        "elapsed_sec": round(elapsed, 1),
    }
    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
    LOG.info("DONE %s", json.dumps(manifest))
    return manifest
|