litsync 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litsync/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """litsync — incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov."""
2
+
3
+ __version__ = "0.0.2"
litsync/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from litsync.cli import main
2
+
3
+ raise SystemExit(main())
litsync/cli.py ADDED
@@ -0,0 +1,143 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from rich.logging import RichHandler
11
+
12
+ from litsync.config import Config
13
+ from litsync.extract import run_extraction
14
+ from litsync.sync import Syncer
15
+ from litsync.ui import RichUI, UI
16
+ from litsync.utils import run_lock
17
+
18
+
19
+ LOG = logging.getLogger("litsync")
20
+
21
+
22
def setup_logging(log_dir: Path, verbose: bool = False) -> None:
    """Configure root logging with a Rich console handler and a dated log file.

    The log file is ``litsync_<YYYY-MM-DD>.log`` inside ``log_dir``.

    Note that ``logging.basicConfig`` does nothing when the root logger
    already has handlers, so only the first call in a process takes effect.

    Args:
        log_dir: Directory for the log file; created (with parents) if missing.
        verbose: Use DEBUG as the root level when true, INFO otherwise.
    """
    # Function-scope import replaces the previous inline
    # ``__import__('datetime')`` without touching the module import block.
    from datetime import date

    log_dir.mkdir(parents=True, exist_ok=True)
    logfile = log_dir / f"litsync_{date.today().isoformat()}.log"

    # Explicit UTF-8: log messages may contain non-ASCII text, and the
    # locale default encoding is platform-dependent.
    file_handler = logging.FileHandler(logfile, encoding="utf-8")
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s %(levelname)-7s %(message)s"
    ))

    # Rich renders its own time/level columns, so only the message is formatted.
    rich_handler = RichHandler(rich_tracebacks=True, show_path=False)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        handlers=[rich_handler, file_handler],
    )
38
+
39
+
40
def parse_args(argv: Optional[list[str]] = None) -> Config:
    """Parse the ``litsync`` command line into a :class:`Config`.

    Args:
        argv: Argument list without the program name; ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        A ``Config`` whose ``data_root`` is user-expanded and resolved and
        whose ``workers`` is clamped to at least 1.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, or when no email
            is supplied via ``--email`` or the ``NCBI_EMAIL`` environment
            variable.
    """
    ap = argparse.ArgumentParser(
        prog="litsync",
        description="Incremental mirror for PubMed, PMC, FDA, and ClinicalTrials.gov",
    )
    ap.add_argument("--data-root", required=True, type=Path,
                    help="root directory for the local mirror")
    ap.add_argument("--email", default=os.environ.get("NCBI_EMAIL", ""),
                    help="contact email (sent in User-Agent)")
    ap.add_argument("--sources", nargs="+", default=["pubmed", "pmc"],
                    choices=["pubmed", "pmc", "fda", "clinicaltrials"])
    ap.add_argument("--pmc-groups", nargs="+", default=["oa_comm", "oa_noncomm", "oa_other"],
                    choices=["oa_comm", "oa_noncomm", "oa_other"])
    ap.add_argument("--pmc-formats", nargs="+", default=["xml"],
                    choices=["xml", "txt"])
    ap.add_argument("--fda-endpoints", nargs="+", default=None,
                    help="openFDA endpoints to mirror, e.g. 'drug/event drug/label'; default: all")
    ap.add_argument("--workers", type=int, default=4)
    ap.add_argument("--max-retries", type=int, default=5)
    ap.add_argument("--timeout", type=int, default=60)
    ap.add_argument("--dry-run", action="store_true",
                    help="plan only, download nothing")
    ap.add_argument("--reverify", action="store_true",
                    help="re-download already-downloaded files to verify integrity")
    ap.add_argument("--prune", action="store_true",
                    help="delete local files no longer present on the server")
    ap.add_argument("--count-articles", action="store_true",
                    help="count articles in already-downloaded local files and exit "
                         "(no network); backfills the per-source article totals")
    # --no-rich is declared here so argparse accepts it and lists it in --help;
    # its parsed value is not copied into Config.
    ap.add_argument("--no-rich", action="store_true",
                    help="disable Rich progress bars and use plain text output")
    ap.add_argument("--verbose", "-v", action="store_true",
                    help="enable debug logging")
    a = ap.parse_args(argv)
    if not a.email:
        ap.error("provide --email or set NCBI_EMAIL")

    if a.verbose:
        # Config carries no verbosity field, so the flag used to be parsed and
        # then silently dropped. Setting the level on the "litsync" logger
        # itself lets DEBUG records through to the root handlers regardless of
        # the level later handed to logging.basicConfig.
        LOG.setLevel(logging.DEBUG)

    return Config(
        data_root=a.data_root.expanduser().resolve(),
        email=a.email,
        sources=tuple(a.sources),
        pmc_groups=tuple(a.pmc_groups),
        pmc_formats=tuple(a.pmc_formats),
        workers=max(1, a.workers),  # never run with fewer than one worker
        max_retries=a.max_retries,
        timeout=a.timeout,
        dry_run=a.dry_run,
        reverify=a.reverify,
        prune=a.prune,
        count_articles=a.count_articles,
        fda_endpoints=tuple(a.fda_endpoints) if a.fda_endpoints else None,
    )
91
+
92
+
93
def sync_command(cfg: Config, ui: UI) -> int:
    """Run a sync (or an article-count backfill) while holding the run lock.

    Creates the data root, sets up logging, then builds a ``Syncer`` inside
    ``run_lock(cfg.lock_path)``.

    Args:
        cfg: Parsed configuration.
        ui: UI object handed to the ``Syncer``.

    Returns:
        Whatever ``Syncer.backfill_counts()`` (when ``cfg.count_articles`` is
        set) or ``Syncer.run()`` returns.
    """
    root = cfg.data_root
    root.mkdir(parents=True, exist_ok=True)
    setup_logging(cfg.log_dir, verbose=False)
    LOG.info(
        "litsync starting | root=%s sources=%s dry_run=%s count_articles=%s",
        root,
        cfg.sources,
        cfg.dry_run,
        cfg.count_articles,
    )
    with run_lock(cfg.lock_path):
        syncer = Syncer(cfg, ui)
        # Pick the entry point first, then invoke it once.
        action = syncer.backfill_counts if cfg.count_articles else syncer.run
        return action()
103
+
104
+
105
def extract_command(argv: Optional[list[str]] = None) -> int:
    """Entry point for ``litsync-extract``: turn a mirror into sharded JSONL.

    Args:
        argv: Argument list without the program name; ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        Always ``0``; the manifest returned by ``run_extraction`` is discarded.
    """
    parser = argparse.ArgumentParser(
        prog="litsync-extract",
        description="Extract litsync mirror into sharded JSONL",
    )
    parser.add_argument("--data-root", type=Path, default=Path("./data/literature"))
    parser.add_argument("--out", type=Path, default=Path("./data/corpus"))
    parser.add_argument(
        "--sources",
        nargs="+",
        default=["pubmed", "pmc"],
        choices=["pubmed", "pmc", "fda", "clinicaltrials"],
    )
    parser.add_argument("--shard-size-mb", type=int, default=256)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--verbose", "-v", action="store_true")
    opts = parser.parse_args(argv)

    if opts.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level, format="%(asctime)s %(levelname)-7s %(message)s")

    # Both directories are normalised to absolute paths before extraction.
    mirror_root = opts.data_root.expanduser().resolve()
    corpus_dir = opts.out.expanduser().resolve()
    run_extraction(mirror_root, corpus_dir, opts.sources, opts.shard_size_mb, opts.limit)
    return 0
131
+
132
+
133
def main(argv: Optional[list[str]] = None) -> int:
    """Entry point for the ``litsync`` command.

    Args:
        argv: Argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Returns:
        The exit status produced by ``sync_command``.
    """
    # The UI choice comes from a literal scan of the raw arguments for
    # "--no-rich", not from the argparse result.
    cli_args = sys.argv[1:] if argv is None else argv
    plain_output = "--no-rich" in cli_args
    cfg = parse_args(argv)
    if plain_output:
        ui = UI()
    else:
        ui = RichUI()
    return sync_command(cfg, ui)
140
+
141
+
142
+ if __name__ == "__main__":
143
+ raise SystemExit(main())
litsync/config.py ADDED
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+
8
DEFAULT_WORKERS = 4


@dataclasses.dataclass
class Config:
    """Settings for one litsync run, plus derived paths under ``data_root``.

    The first five fields are required; the rest have defaults. All derived
    paths live inside ``<data_root>/_state``.
    """

    # Required settings.
    data_root: Path
    email: str
    sources: tuple[str, ...]
    pmc_groups: tuple[str, ...]
    pmc_formats: tuple[str, ...]

    # Optional settings.
    workers: int = DEFAULT_WORKERS
    max_retries: int = 5
    backoff_base: float = 2.0
    timeout: int = 60
    dry_run: bool = False
    reverify: bool = False
    prune: bool = False
    count_articles: bool = False
    fda_endpoints: Optional[tuple[str, ...]] = None

    @property
    def state_dir(self) -> Path:
        """Directory ``<data_root>/_state``."""
        return self.data_root.joinpath("_state")

    @property
    def db_path(self) -> Path:
        """File ``state.sqlite`` inside :attr:`state_dir`."""
        return self.state_dir.joinpath("state.sqlite")

    @property
    def lock_path(self) -> Path:
        """File ``litsync.lock`` inside :attr:`state_dir`."""
        return self.state_dir.joinpath("litsync.lock")

    @property
    def log_dir(self) -> Path:
        """Directory ``logs`` inside :attr:`state_dir`."""
        return self.state_dir.joinpath("logs")
litsync/extract.py ADDED
@@ -0,0 +1,409 @@
1
+ from __future__ import annotations
2
+
3
+ import gzip
4
+ import io
5
+ import json
6
+ import logging
7
+ import re
8
+ import tarfile
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Iterator, Optional
12
+ from xml.etree import ElementTree as ET
13
+
14
+ LOG = logging.getLogger("litsync")
15
_WS = re.compile(r"\s+")


def clean(s: Optional[str]) -> str:
    """Collapse every whitespace run in ``s`` to one space and trim the ends.

    ``None`` and the empty string both yield ``""``.
    """
    if not s:
        return ""
    collapsed = _WS.sub(" ", s)
    return collapsed.strip()
20
+
21
+
22
def local_tag(tag: str) -> str:
    """Return the part of ``tag`` after its last ``}``, or ``tag`` if none.

    ElementTree spells namespaced tags as ``{uri}name``; this strips the
    ``{uri}`` prefix.
    """
    _, _, local = tag.rpartition("}")
    return local
24
+
25
+
26
def text(el: Optional[ET.Element]) -> str:
    """Return all text beneath ``el``, whitespace-normalised by ``clean``.

    Text fragments from the element and its descendants are joined with a
    single space. ``None`` yields ``""``.
    """
    return "" if el is None else clean(" ".join(el.itertext()))
30
+
31
+
32
def find_local(parent: ET.Element, name: str) -> Optional[ET.Element]:
    """Return the first element whose namespace-stripped tag equals ``name``.

    The search walks ``parent.iter()``, so it covers ``parent`` itself and
    every descendant. Returns ``None`` when nothing matches.
    """
    matches = (node for node in parent.iter() if local_tag(node.tag) == name)
    return next(matches, None)
37
+
38
+
39
def findall_local(parent: ET.Element, name: str) -> list[ET.Element]:
    """Return every element whose namespace-stripped tag equals ``name``.

    The search walks ``parent.iter()`` (``parent`` itself plus all
    descendants) and keeps iteration order.
    """
    found: list[ET.Element] = []
    for node in parent.iter():
        if local_tag(node.tag) == name:
            found.append(node)
    return found
41
+
42
+
43
+ # --------------------------------------------------------------------------- #
44
+ # PubMed
45
+ # --------------------------------------------------------------------------- #
46
def pubmed_year(article: ET.Element) -> Optional[int]:
    """Return a publication year taken from the article's ``PubDate`` elements.

    For each ``PubDate`` in iteration order, an all-digit ``Year`` child wins;
    failing that, the first four-digit run in a ``MedlineDate`` child is used.
    Returns ``None`` if no ``PubDate`` yields a year.
    """
    for pub_date in findall_local(article, "PubDate"):
        year_text = (pub_date.findtext("Year") or "").strip()
        if year_text.isdigit():
            return int(year_text)

        medline_date = pub_date.findtext("MedlineDate")
        if not medline_date:
            continue
        match = re.search(r"\d{4}", medline_date)
        if match is not None:
            return int(match.group())
    return None
57
+
58
+
59
def parse_pubmed(raw: bytes, source_file: str) -> Iterator[dict]:
    """Yield one record dict per ``PubmedArticle`` in a PubMed XML document.

    Args:
        raw: Decompressed XML bytes.
        source_file: Value copied into each record's ``source_file`` field.

    Yields:
        Dicts with the keys ``source``, ``pmid``, ``pmcid``, ``doi``,
        ``title``, ``abstract``, ``body``, ``journal``, ``year``, ``authors``,
        ``mesh``, ``keywords`` and ``source_file``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``raw`` is not well-formed XML.
    """
    for _, elem in ET.iterparse(io.BytesIO(raw), events=("end",)):
        if local_tag(elem.tag) != "PubmedArticle":
            continue
        try:
            pmid_el = find_local(elem, "PMID")
            title = text(find_local(elem, "ArticleTitle"))
            abstract = " ".join(text(a) for a in findall_local(elem, "AbstractText")).strip()

            journal = ""
            jt = find_local(elem, "Journal")
            if jt is not None:
                journal = text(jt.find("Title"))

            authors = []
            for au in findall_local(elem, "Author"):
                last = au.findtext("LastName") or ""
                init = au.findtext("Initials") or ""
                coll = au.findtext("CollectiveName") or ""
                # Personal name when present, otherwise the collective name.
                name = clean(f"{last} {init}".strip() or coll)
                if name:
                    authors.append(name)

            mesh = [text(d) for d in findall_local(elem, "DescriptorName") if text(d)]
            keywords = [text(k) for k in findall_local(elem, "Keyword") if text(k)]

            # Only the ArticleIdList that is a direct child of PubmedData
            # describes this article. Cited references carry their own
            # ArticleIdList, so scanning every ArticleId in the subtree would
            # hand a DOI-less article the DOI of one of its references.
            doi = None
            pubmed_data = find_local(elem, "PubmedData")
            if pubmed_data is not None:
                for id_list in pubmed_data:
                    if local_tag(id_list.tag) != "ArticleIdList":
                        continue
                    for aid in id_list:
                        if aid.get("IdType") == "doi" and aid.text:
                            doi = aid.text.strip()
                            break
                    if doi is not None:
                        break

            yield {
                "source": "pubmed",
                "pmid": pmid_el.text.strip() if pmid_el is not None and pmid_el.text else None,
                "pmcid": None,
                "doi": doi,
                "title": title,
                "abstract": clean(abstract),
                "body": "",
                "journal": journal,
                "year": pubmed_year(elem),
                "authors": authors,
                "mesh": mesh,
                "keywords": keywords,
                "source_file": source_file,
            }
        finally:
            # Drop the article's subtree once it has been consumed so memory
            # does not grow with the number of articles in the file.
            elem.clear()
103
+
104
+
105
+ # --------------------------------------------------------------------------- #
106
+ # PMC
107
+ # --------------------------------------------------------------------------- #
108
def parse_pmc_article(raw: bytes, source_file: str) -> Optional[dict]:
    """Parse one PMC article XML document into a record dict.

    Args:
        raw: XML bytes of a single article.
        source_file: Value copied into the record's ``source_file`` field.

    Returns:
        The record dict, or ``None`` when ``raw`` is not well-formed XML.
    """
    try:
        root = ET.fromstring(raw)
    except ET.ParseError:
        return None

    # Map id type -> id value; a later element of the same type overwrites
    # an earlier one.
    ids = {}
    for id_el in findall_local(root, "article-id"):
        id_type = id_el.get("pub-id-type") or id_el.get("{http://www.w3.org/1999/xlink}type")
        if id_type and id_el.text:
            ids[id_type] = id_el.text.strip()

    pmcid = ids.get("pmc") or ids.get("pmcid")
    if pmcid and not pmcid.upper().startswith("PMC"):
        pmcid = f"PMC{pmcid}"

    # First pub-date whose direct <year> child is all digits.
    year_texts = (
        (pub_date.findtext("year") or "").strip()
        for pub_date in findall_local(root, "pub-date")
    )
    year = next((int(y) for y in year_texts if y.isdigit()), None)

    authors = []
    for contrib in findall_local(root, "contrib"):
        # Contributors with no contrib-type are treated as authors.
        if contrib.get("contrib-type") in (None, "author"):
            name_el = find_local(contrib, "name")
            if name_el is None:
                continue
            surname = name_el.findtext("surname") or ""
            given = name_el.findtext("given-names") or ""
            full_name = clean(f"{surname} {given}".strip())
            if full_name:
                authors.append(full_name)

    abstract_parts = [text(a) for a in findall_local(root, "abstract")]
    abstract = " ".join(abstract_parts).strip()

    keywords = []
    for kwd in findall_local(root, "kwd"):
        kwd_text = text(kwd)
        if kwd_text:
            keywords.append(kwd_text)

    record = {
        "source": "pmc",
        "pmid": ids.get("pmid"),
        "pmcid": pmcid,
        "doi": ids.get("doi"),
        "title": text(find_local(root, "article-title")),
        "abstract": clean(abstract),
        "body": text(find_local(root, "body")),
        "journal": text(find_local(root, "journal-title")),
        "year": year,
        "authors": authors,
        "mesh": [],
        "keywords": keywords,
        "source_file": source_file,
    }
    return record
165
+
166
+
167
def parse_pmc_tar(path: Path, rel: str) -> Iterator[dict]:
    """Yield a record for each parseable ``.xml``/``.nxml`` member of a tarball.

    The archive is opened as a gzip stream (``r|gz``) and read in member
    order. Members that ``parse_pmc_article`` cannot parse are skipped.

    Args:
        path: Path of the ``.tar.gz`` archive.
        rel: Archive label; each record's ``source_file`` is
            ``"<rel>::<member name>"``.
    """
    with tarfile.open(path, mode="r|gz") as archive:
        for entry in archive:
            is_article = entry.isfile() and entry.name.lower().endswith((".xml", ".nxml"))
            if not is_article:
                continue
            handle = archive.extractfile(entry)
            if handle is None:
                continue
            record = parse_pmc_article(handle.read(), f"{rel}::{entry.name}")
            if record is None:
                continue
            yield record
181
+
182
+
183
+ # --------------------------------------------------------------------------- #
184
+ # openFDA
185
+ # --------------------------------------------------------------------------- #
186
+ def _flatten_fda_value(v):
187
+ if isinstance(v, str):
188
+ return v
189
+ if isinstance(v, (list, tuple)):
190
+ return " ".join(_flatten_fda_value(x) for x in v)
191
+ if isinstance(v, dict):
192
+ return " ".join(_flatten_fda_value(x) for x in v.values())
193
+ return ""
194
+
195
+
196
def parse_fda_json_file(path: Path, rel: str) -> Iterator[dict]:
    """Yield one record per entry of the ``results`` array in an openFDA file.

    Args:
        path: JSON file to read (decoded as UTF-8).
        rel: Path label for the file. Its second and third components become
            the record's ``fda_endpoint`` (empty when ``rel`` has fewer than
            three components), and the whole value becomes ``source_file``.

    Yields:
        Record dicts whose ``body`` is the flattened text of the entry and
        whose ``id`` is ``safetyreportid``, ``set_id`` or ``id`` when present,
        else the entry's first string value, else ``None``.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    # ``or`` also covers an explicit JSON null for either key.
    results = data.get("results") or []
    meta = data.get("meta") or {}
    last_updated = meta.get("last_updated")

    parts = Path(rel).parts
    endpoint = "/".join(parts[1:3]) if len(parts) >= 3 else ""

    for rec in results:
        # Named ``body`` rather than ``text`` so the module-level ``text()``
        # helper is not shadowed.
        body = clean(_flatten_fda_value(rec))

        # The dict check must come before any ``rec.get``: a non-dict entry
        # still yields a record (with no id) instead of raising.
        rid = None
        if isinstance(rec, dict):
            rid = rec.get("safetyreportid") or rec.get("set_id") or rec.get("id")
            if not rid:
                # Fall back to the first string value in the entry.
                rid = next((v for v in rec.values() if isinstance(v, str)), None)

        yield {
            "source": "fda",
            "fda_endpoint": endpoint,
            "id": str(rid) if rid else None,
            "title": "",
            "abstract": "",
            "body": body,
            "journal": "",
            "year": None,
            "authors": [],
            "mesh": [],
            "keywords": [],
            "source_file": rel,
            "last_updated": last_updated,
        }
226
+
227
+
228
+ # --------------------------------------------------------------------------- #
229
+ # ClinicalTrials
230
+ # --------------------------------------------------------------------------- #
231
def parse_clinicaltrials_xml_file(path: Path, rel: str) -> Optional[dict]:
    """Parse one ClinicalTrials XML file into a record dict.

    Args:
        path: XML file to read.
        rel: Value copied into the record's ``source_file`` field.

    Returns:
        The record dict, or ``None`` when the file is not well-formed XML.
    """
    try:
        root = ET.fromstring(path.read_bytes())
    except ET.ParseError:
        return None

    def first_text(tag: str) -> str:
        # Text of the first element named ``tag``; "" when absent.
        return text(find_local(root, tag))

    def all_texts(tag: str) -> list[str]:
        # Text of every element named ``tag``, in iteration order.
        return [text(node) for node in findall_local(root, tag)]

    nct_id = first_text("nct_id") or first_text("nctId")
    title = first_text("official_title") or first_text("brief_title")
    brief_summary = first_text("brief_summary")
    detailed_description = first_text("detailed_description")
    eligibility = first_text("eligibility")
    conditions = all_texts("condition")
    interventions = all_texts("intervention")
    phases = all_texts("phase")
    statuses = all_texts("overall_status")

    # The first date field containing a four-digit run supplies the year.
    year = None
    for date_tag in ("start_date", "completion_date", "verification_date", "study_first_submitted"):
        match = re.search(r"\d{4}", first_text(date_tag))
        if match is not None:
            year = int(match.group())
            break

    sections = [brief_summary, detailed_description, eligibility]
    if conditions:
        sections.append("Conditions: " + ", ".join(conditions))
    if interventions:
        sections.append("Interventions: " + ", ".join(interventions))
    body = "\n\n".join(section for section in sections if section)

    record = {
        "source": "clinicaltrials",
        "nct_id": nct_id,
        "pmid": None,
        "pmcid": None,
        "doi": None,
        "title": title,
        "abstract": clean(brief_summary),
        "body": clean(body),
        "journal": "",
        "year": year,
        "authors": [],
        "mesh": [],
        "keywords": conditions + interventions,
        "source_file": rel,
        "phase": phases[0] if phases else None,
        "overall_status": statuses[0] if statuses else None,
    }
    return record
288
+
289
+
290
+ # --------------------------------------------------------------------------- #
291
+ # Sharded JSONL writer
292
+ # --------------------------------------------------------------------------- #
293
class ShardWriter:
    """Write records as JSON Lines, rolling to a new file at a size limit.

    Shards are named ``<prefix>-<NNNNN>.jsonl`` inside ``out_dir``, numbered
    from 1. The first shard is opened on construction, so ``out_dir`` must
    already exist. A shard always receives at least one record before rolling,
    so a single record larger than ``shard_bytes`` still gets written.

    The writer can be used as a context manager; leaving the ``with`` block
    closes the current shard.

    Attributes:
        idx: Number of the current shard (also the count of shards opened).
        bytes: UTF-8 bytes written to the current shard.
        records: Total records written across all shards.
        fh: Open handle of the current shard, or ``None`` after ``close()``.
    """

    # Same logger object the rest of the package obtains by this name.
    _log = logging.getLogger("litsync")

    def __init__(self, out_dir: Path, shard_bytes: int, prefix: str = "corpus"):
        self.out_dir = out_dir
        self.shard_bytes = shard_bytes
        self.prefix = prefix
        self.idx = 0
        self.bytes = 0
        self.records = 0
        self.fh = None
        self._roll()

    def __enter__(self) -> "ShardWriter":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _roll(self) -> None:
        """Close the current shard, if any, and open the next one."""
        if self.fh:
            self.fh.close()
        self.idx += 1
        self.bytes = 0
        name = f"{self.prefix}-{self.idx:05d}.jsonl"
        # newline="\n" disables newline translation, so the bytes on disk
        # match the UTF-8 length counted in write() on every platform.
        self.fh = open(self.out_dir / name, "w", encoding="utf-8", newline="\n")
        self._log.info("writing shard %s", name)

    def write(self, rec: dict) -> None:
        """Append ``rec`` as one JSON line, rolling first if it would overflow.

        Raises:
            ValueError: If the writer has already been closed.
        """
        if self.fh is None:
            raise ValueError("write to closed ShardWriter")
        line = json.dumps(rec, ensure_ascii=False) + "\n"
        size = len(line.encode("utf-8"))
        # ``self.bytes and ...`` never rolls an empty shard.
        if self.bytes and self.bytes + size > self.shard_bytes:
            self._roll()
        self.fh.write(line)
        self.bytes += size
        self.records += 1

    def close(self) -> None:
        """Close the current shard; safe to call more than once."""
        if self.fh:
            self.fh.close()
            self.fh = None
325
+
326
+
327
def iter_source_files(data_root: Path, sources: list[str], limit: Optional[int]):
    """Yield ``(kind, path)`` for every mirrored file of the requested sources.

    Sources are visited in the fixed order pubmed, pmc, fda, clinicaltrials;
    within a source, paths are sorted.

    Args:
        data_root: Root directory of the mirror.
        sources: Source names to include.
        limit: Maximum number of files per source (not overall). ``None`` or
            ``0`` means no limit.
    """
    patterns = (
        ("pubmed", "pubmed/**/*.xml.gz"),
        ("pmc", "pmc/**/*.tar.gz"),
        ("fda", "fda/**/*/*.json"),
        ("clinicaltrials", "clinicaltrials/**/*.xml"),
    )
    # A falsy limit becomes an open-ended slice.
    cap = limit if limit else None
    for kind, pattern in patterns:
        if kind not in sources:
            continue
        for found in sorted(data_root.glob(pattern))[:cap]:
            yield kind, found
344
+
345
+
346
def _iter_records(kind: str, path: Path, rel: str) -> Iterator[dict]:
    """Yield the records of one mirrored file using the parser for ``kind``."""
    if kind == "pubmed":
        yield from parse_pubmed(gzip.decompress(path.read_bytes()), rel)
    elif kind == "pmc":
        yield from parse_pmc_tar(path, rel)
    elif kind == "fda":
        yield from parse_fda_json_file(path, rel)
    elif kind == "clinicaltrials":
        rec = parse_clinicaltrials_xml_file(path, rel)
        if rec:
            yield rec


def run_extraction(data_root: Path, out_dir: Path, sources: list[str],
                   shard_size_mb: int, limit: Optional[int]) -> dict:
    """Extract a mirror into sharded JSONL and write ``manifest.json``.

    A failure in one file is logged and counted in ``errors``; extraction then
    continues with the next file.

    Args:
        data_root: Root directory of the mirror.
        out_dir: Output directory; created if missing.
        sources: Source names to extract.
        shard_size_mb: Shard size limit in MiB.
        limit: Per-source cap on the number of files (``None`` for no cap).

    Returns:
        The manifest dict: per-source file and record counts, ``errors``,
        ``total_records``, ``shards``, ``shard_size_mb``, ``out_dir`` and
        ``elapsed_sec``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    writer = ShardWriter(out_dir, shard_size_mb * 1024 * 1024)

    stats = {"pubmed_files": 0, "pmc_files": 0, "fda_files": 0, "clinicaltrials_files": 0,
             "pubmed_records": 0, "pmc_records": 0, "fda_records": 0,
             "clinicaltrials_records": 0, "errors": 0}
    t0 = time.time()

    # try/finally so the open shard is closed even when something escapes the
    # per-file handler (e.g. KeyboardInterrupt).
    try:
        for kind, path in iter_source_files(data_root, sources, limit):
            rel = str(path.relative_to(data_root))
            LOG.info("processing %s (%s)", rel, kind)
            n = 0
            try:
                for rec in _iter_records(kind, path, rel):
                    writer.write(rec)
                    n += 1
                    # Progress marker for large PMC archives.
                    if kind == "pmc" and n % 5000 == 0:
                        LOG.info("  ... %d articles from %s", n, rel)
            except Exception as exc:
                # Best effort: one bad file must not abort the whole run.
                stats["errors"] += 1
                LOG.error("failed on %s: %s", rel, exc)
            else:
                stats[f"{kind}_files"] += 1
                LOG.info("  -> %d records (running total %d)", n, writer.records)
            finally:
                # Records written before a mid-file failure are already in the
                # shards, so they are counted here as well; this keeps the
                # per-source totals in agreement with ``total_records``.
                stats[f"{kind}_records"] += n
    finally:
        writer.close()
    elapsed = time.time() - t0

    manifest = {
        **stats,
        "total_records": writer.records,
        "shards": writer.idx,
        "shard_size_mb": shard_size_mb,
        "out_dir": str(out_dir),
        "elapsed_sec": round(elapsed, 1),
    }
    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    LOG.info("DONE %s", json.dumps(manifest))
    return manifest