litsync 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litsync/__init__.py +3 -0
- litsync/__main__.py +3 -0
- litsync/cli.py +143 -0
- litsync/config.py +42 -0
- litsync/extract.py +409 -0
- litsync/http.py +127 -0
- litsync/sources/__init__.py +17 -0
- litsync/sources/clinicaltrials.py +28 -0
- litsync/sources/fda.py +48 -0
- litsync/sources/pmc.py +51 -0
- litsync/sources/pubmed.py +37 -0
- litsync/state.py +147 -0
- litsync/sync.py +334 -0
- litsync/ui.py +232 -0
- litsync/utils.py +122 -0
- litsync-0.0.2.dist-info/METADATA +125 -0
- litsync-0.0.2.dist-info/RECORD +20 -0
- litsync-0.0.2.dist-info/WHEEL +5 -0
- litsync-0.0.2.dist-info/entry_points.txt +3 -0
- litsync-0.0.2.dist-info/top_level.txt +1 -0
litsync/sync.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import datetime as dt
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import shutil
|
|
8
|
+
import threading
|
|
9
|
+
import zipfile
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from litsync.config import Config
|
|
15
|
+
from litsync.http import HttpClient
|
|
16
|
+
from litsync.sources import Task
|
|
17
|
+
from litsync.sources.clinicaltrials import ClinicalTrialsSource
|
|
18
|
+
from litsync.sources.fda import FdaSource
|
|
19
|
+
from litsync.sources.pmc import PmcSource
|
|
20
|
+
from litsync.sources.pubmed import PubMedSource
|
|
21
|
+
from litsync.state import FileRecord, StateDB
|
|
22
|
+
from litsync.ui import UI
|
|
23
|
+
from litsync.utils import count_articles, human_bytes, md5_file, new_src_stats, utcnow
|
|
24
|
+
|
|
25
|
+
LOG = logging.getLogger("litsync")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Syncer:
    """Drive one litsync run: plan tasks, fetch/verify files, and report.

    ``run()`` performs a network sync; ``backfill_counts()`` only counts
    records in files already on disk. Counters (``stats``, ``per_source``,
    ``bytes_downloaded``, ``articles_downloaded``) are updated from worker
    threads and are guarded by ``_stats_lock``.
    """

    def __init__(self, cfg: Config, ui: UI):
        self.cfg = cfg
        self.ui = ui
        self.http = HttpClient(cfg)
        self.db = StateDB(cfg.db_path)
        self._stats_lock = threading.Lock()
        self.stats = {"skipped": 0, "downloaded": 0, "verified": 0, "failed": 0}
        self.bytes_downloaded = 0
        self.articles_downloaded = 0
        self.per_source: dict[str, dict] = {}
        self.source_urls: dict[str, str] = {}
        self.started_at = utcnow()

    # ---- per-file decision logic ---------------------------------------- #

    def _needs_work(self, task: Task) -> tuple[bool, Optional[int]]:
        """Decide whether *task* must be downloaded.

        Returns ``(needs_download, size)`` where ``size`` is the recorded or
        freshly reported remote size (may be ``None``).
        """
        row = self.db.get(task.source, task.filename)
        on_disk = task.dest.exists()

        # Fast path: a verified, immutable file is trusted without a HEAD
        # request as long as its on-disk size still matches the recorded one.
        if (row is not None and row["status"] == "verified" and on_disk
                and not self.cfg.reverify and task.immutable):
            recorded = row["remote_size"]
            if recorded is None or task.dest.stat().st_size == recorded:
                return False, recorded

        size, mtime, etag = self.http.head(task.url)
        self.db.mark(task.source, task.filename,
                     remote_size=size, remote_mtime=mtime, etag=etag)

        if (row is not None and on_disk and not self.cfg.reverify
                and row["status"] in ("done", "verified")):
            # A missing size/etag from the server is treated as "unchanged".
            same_size = size is None or task.dest.stat().st_size == size
            same_etag = etag is None or row["etag"] == etag
            if same_size and same_etag:
                return False, size

        return True, size

    # ---- worker --------------------------------------------------------- #

    def _process(self, task: Task, download_progress) -> None:
        """Handle one task end to end; failures are recorded, not raised."""
        is_new = self.db.get(task.source, task.filename) is None
        self.db.upsert_seen(FileRecord(task.source, task.filename, task.url, task.rel_path))
        self._record_seen(task.source, is_new)
        try:
            needs, expected_size = self._needs_work(task)
        except Exception as exc:
            self.db.mark(task.source, task.filename, status="failed", error=str(exc))
            self._bump("failed", task.source)
            LOG.error("metadata failed for %s: %s", task.filename, exc)
            return

        if not needs:
            self._bump("skipped", task.source)
            return

        if self.cfg.dry_run:
            self._bump("downloaded", task.source, expected_size or 0)
            return

        expected_md5 = None
        if task.md5_url:
            # Best effort: a missing or unparseable sidecar must not block the
            # download, but say so instead of silently skipping verification.
            try:
                expected_md5 = md5_file_from_url(self.http.get_text(task.md5_url))
                self.db.mark(task.source, task.filename, md5=expected_md5)
            except Exception as exc:
                LOG.warning("md5 sidecar unavailable for %s: %s", task.filename, exc)

        task_id = download_progress.add_task(task.filename, expected_size)

        def progress_cb(nbytes: int):
            download_progress.update(task_id, advance=nbytes)

        try:
            attempts = (self.db.get(task.source, task.filename)["attempts"] or 0) + 1
            self.db.mark(task.source, task.filename, status="pending", attempts=attempts)
            written = self.http.download(task.url, task.dest, expected_size, progress_cb)
            download_progress.update(task_id, completed=written)

            if expected_md5:
                actual = md5_file(task.dest)
                if actual.lower() != expected_md5.lower():
                    # Remove the corrupt file so the next run starts clean.
                    task.dest.unlink(missing_ok=True)
                    raise IOError(f"md5 mismatch: got {actual}, expected {expected_md5}")
                self.db.mark(task.source, task.filename,
                             status="verified", local_md5=actual, error=None)
            else:
                self.db.mark(task.source, task.filename, status="verified", error=None)
            self._bump("verified", task.source, written)

            if task.extract:
                self._extract_zip(task)
            self._count_and_store(task)
            LOG.info("ok %s", task.rel_path)
        except Exception as exc:
            self.db.mark(task.source, task.filename, status="failed", error=str(exc))
            self._bump("failed", task.source)
            LOG.error("download failed for %s: %s", task.filename, exc)

    def _bump(self, key: str, source: Optional[str] = None, nbytes: int = 0) -> None:
        """Increment counter *key* (and byte totals) globally and per source."""
        with self._stats_lock:
            self.stats[key] += 1
            self.bytes_downloaded += nbytes
            if source is not None:
                s = self.per_source.setdefault(source, new_src_stats())
                s[key] += 1
                s["bytes"] += nbytes

    def _record_seen(self, source: str, is_new: bool) -> None:
        """Count a planned file as new or already known for *source*."""
        with self._stats_lock:
            s = self.per_source.setdefault(source, new_src_stats())
            s["new" if is_new else "existing"] += 1

    def _extract_zip(self, task: Task) -> None:
        """Unpack ``task.dest`` next to itself unless the extraction is current."""
        if not task.dest.exists():
            return
        name = task.dest.name
        if name.endswith(".json.zip"):
            extract_dir = task.dest.parent / name[:-9]  # drop ".json.zip"
        else:
            extract_dir = task.dest.with_suffix("")
        if extract_dir.exists():
            zip_mtime = task.dest.stat().st_mtime
            dir_mtime = max(
                (p.stat().st_mtime for p in extract_dir.rglob("*") if p.is_file()),
                default=0,
            )
            # Extracted files at least as new as the archive: nothing to do.
            if dir_mtime >= zip_mtime:
                return
            shutil.rmtree(extract_dir, ignore_errors=True)
        self.ui.extract(task.rel_path)
        extract_dir.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(task.dest, "r") as zf:
            zf.extractall(extract_dir)

    def _count_and_store(self, task: Task) -> None:
        """Count records in the downloaded file and persist/accumulate the total."""
        if self.cfg.dry_run:
            return
        try:
            n = count_articles(task.dest, task.source)
        except Exception as exc:
            LOG.warning("article count failed for %s: %s", task.filename, exc)
            return
        self.db.mark(task.source, task.filename, article_count=n)
        with self._stats_lock:
            self.per_source.setdefault(task.source, new_src_stats())["articles"] += n
            self.articles_downloaded += n

    # ---- prune ---------------------------------------------------------- #

    def _prune(self, tasks_by_source: dict[str, list[Task]]) -> None:
        """Delete local files that the DB knows but the remote no longer lists.

        Only sources whose family (name prefix before the first ``_``) is part
        of this run's ``cfg.sources`` are considered. In dry-run mode the
        candidates are logged but not removed.
        """
        remote_by_source = {src: {t.filename for t in tasks}
                            for src, tasks in tasks_by_source.items()}
        synced_families = set(self.cfg.sources)

        def family(source: str) -> str:
            return source.split("_", 1)[0]

        for source in self.db.all_sources():
            if family(source) not in synced_families:
                continue
            remote = remote_by_source.get(source, set())
            for fname in self.db.known_filenames(source) - remote:
                row = self.db.get(source, fname)
                if row and row["rel_path"]:
                    p = self.cfg.data_root / row["rel_path"]
                    if p.exists():
                        LOG.info("prune (no longer remote): %s", row["rel_path"])
                        if not self.cfg.dry_run:
                            p.unlink(missing_ok=True)

    # ---- run ------------------------------------------------------------ #

    def run(self) -> int:
        """Plan and process all configured sources; return 1 if anything failed.

        The state DB is closed even when planning or reporting raises.
        """
        try:
            planners = []
            if "pubmed" in self.cfg.sources:
                planners.append(PubMedSource(self.cfg, self.http))
            if "pmc" in self.cfg.sources:
                planners.append(PmcSource(self.cfg, self.http))
            if "fda" in self.cfg.sources:
                planners.append(FdaSource(self.cfg, self.http))
            if "clinicaltrials" in self.cfg.sources:
                planners.append(ClinicalTrialsSource(self.cfg, self.http))

            all_tasks: list[Task] = []
            by_source: dict[str, list[Task]] = {}
            for p in planners:
                self.ui.planning(type(p).__name__)
                tasks = p.plan()
                all_tasks.extend(tasks)
                for t in tasks:
                    by_source.setdefault(t.source, []).append(t)
                    self.source_urls.setdefault(t.source, t.url.rsplit("/", 1)[0] + "/")
            self.ui.planned(len(all_tasks), len(by_source))

            with self.ui.metadata_progress(len(all_tasks)) as meta, \
                    self.ui.download_progress() as dl:
                def work(task: Task):
                    try:
                        self._process(task, dl)
                    finally:
                        # Keep the progress bar honest even if the task blew up.
                        meta.advance(1)

                with ThreadPoolExecutor(max_workers=self.cfg.workers) as pool:
                    futures = {pool.submit(work, t): t for t in all_tasks}
                    for fut in as_completed(futures):
                        # Anything escaping _process would otherwise vanish
                        # and the run would wrongly exit with status 0.
                        exc = fut.exception()
                        if exc is not None:
                            failed_task = futures[fut]
                            self._bump("failed", failed_task.source)
                            LOG.error("unexpected error for %s: %s",
                                      failed_task.filename, exc)

            if self.cfg.prune:
                self._prune(by_source)

            self._report()
        finally:
            self.db.close()
        return 1 if self.stats["failed"] else 0

    # ---- article-count backfill (no network) --------------------------- #

    def backfill_counts(self) -> int:
        """Count records for DB rows lacking a count; return 1 on any failure."""
        try:
            rows = self.db.files_missing_counts()
            self.ui.planned(len(rows), 0)
            # Kept as an instance attribute for backward compatibility.
            self.failed = 0

            def work(item: tuple[str, str, str]) -> None:
                source, filename, rel_path = item
                path = self.cfg.data_root / rel_path
                if not path.exists():
                    return
                try:
                    n = count_articles(path, source)
                except Exception as exc:
                    with self._stats_lock:
                        self.failed += 1
                    LOG.error("count failed for %s: %s", rel_path, exc)
                    return
                self.db.mark(source, filename, article_count=n)
                with self._stats_lock:
                    self.per_source.setdefault(source, new_src_stats())["articles"] += n
                    self.articles_downloaded += n

            with ThreadPoolExecutor(max_workers=self.cfg.workers) as pool:
                futures = {pool.submit(work, it): it for it in rows}
                for fut in as_completed(futures):
                    # Surface errors raised outside the counting try-block.
                    exc = fut.exception()
                    if exc is not None:
                        with self._stats_lock:
                            self.failed += 1
                        LOG.error("count failed for %s: %s", futures[fut][2], exc)

            self.stats["failed"] = self.failed
            self._report()
        finally:
            self.db.close()
        return 1 if self.failed else 0

    # ---- post-sync summary --------------------------------------------- #

    def _report(self) -> None:
        """Write the JSON manifest and show the UI summary for this run."""
        finished = utcnow()
        mirror = self.db.summary_by_source()
        self.write_json_manifest(finished, mirror)
        self.ui.summary(
            self.started_at, finished, self.stats, self.per_source, mirror,
            self.source_urls, self.bytes_downloaded, self.articles_downloaded,
        )

    def write_json_manifest(self, finished: str, mirror: dict) -> None:
        """Write ``summary_<today>.json`` into ``cfg.log_dir``.

        *mirror* maps source name to the per-source totals returned by the
        state DB. An ``OSError`` while writing is logged, not raised.
        """
        sources = sorted(set(mirror) | set(self.per_source))
        new_dl = self.stats["verified"] + self.stats["downloaded"]
        manifest = {
            "started_at": self.started_at,
            "finished_at": finished,
            "this_run": {
                "newly_downloaded": new_dl,
                "already_current": self.stats["skipped"],
                "failed": self.stats["failed"],
                "bytes_downloaded": self.bytes_downloaded,
                "articles_added": self.articles_downloaded,
            },
            "sources": {
                source: {
                    "url": self.source_urls.get(source),
                    "mirror_files": mirror.get(source, {}).get("files", 0),
                    "mirror_bytes": mirror.get(source, {}).get("bytes", 0),
                    "mirror_articles": mirror.get(source, {}).get("articles", 0),
                    "files_counted": mirror.get(source, {}).get("counted", 0),
                    # Per-run counters are merged in last.
                    **self.per_source.get(source, new_src_stats()),
                }
                for source in sources
            },
            "mirror_total": {
                "files": sum(m.get("files", 0) for m in mirror.values()),
                "bytes": sum(m.get("bytes", 0) for m in mirror.values()),
                "articles": sum(m.get("articles", 0) for m in mirror.values()),
            },
        }
        out = self.cfg.log_dir / f"summary_{dt.date.today().isoformat()}.json"
        try:
            out.parent.mkdir(parents=True, exist_ok=True)
            out.write_text(json.dumps(manifest, indent=2))
            LOG.info("wrote summary manifest: %s", out)
        except OSError as exc:
            LOG.warning("could not write summary manifest: %s", exc)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def md5_file_from_url(text: str) -> str:
    """Extract a 32-digit hex MD5 from the body of a checksum sidecar.

    A digest following ``=`` is preferred; otherwise the first standalone
    32-hex-digit word is used. Raises ``ValueError`` when neither is found.
    """
    import re

    hit = (re.search(r"=\s*([0-9a-fA-F]{32})", text)
           or re.search(r"\b([0-9a-fA-F]{32})\b", text))
    if hit is None:
        raise ValueError(f"could not parse md5 from: {text[:120]!r}")
    return hit.group(1)
|
litsync/ui.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.panel import Panel
|
|
8
|
+
from rich.progress import (
|
|
9
|
+
BarColumn,
|
|
10
|
+
DownloadColumn,
|
|
11
|
+
Progress,
|
|
12
|
+
SpinnerColumn,
|
|
13
|
+
TaskID,
|
|
14
|
+
TextColumn,
|
|
15
|
+
TimeRemainingColumn,
|
|
16
|
+
TransferSpeedColumn,
|
|
17
|
+
)
|
|
18
|
+
from rich.table import Table
|
|
19
|
+
from rich.text import Text
|
|
20
|
+
|
|
21
|
+
from litsync.utils import human_bytes, partial_note
|
|
22
|
+
|
|
23
|
+
console = Console()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class UI:
    """Plain-text fallback UI (no colors, no progress bars)."""

    def planning(self, name: str):
        """Announce that planner *name* is being run."""
        message = f"Planning {name} ..."
        print(message)

    def planned(self, total: int, sources: int):
        """Report how many files were planned across how many source groups."""
        message = f"Planned {total} files across {sources} source groups"
        print(message)

    @contextlib.contextmanager
    def metadata_progress(self, total: int):
        """Yield a do-nothing stand-in for the metadata progress bar."""
        tracker = _NoopProgress(total)
        yield tracker

    @contextlib.contextmanager
    def download_progress(self):
        """Yield a do-nothing stand-in for the download progress display."""
        tracker = _NoopDownloadProgress()
        yield tracker

    def extract(self, rel_path: str):
        """Announce that *rel_path* is being extracted."""
        message = f"Extracting {rel_path}"
        print(message)

    def summary(self, started: str, finished: str, stats: dict, per_source: dict,
                mirror: dict, source_urls: dict, bytes_downloaded: int, articles_downloaded: int):
        """Print a short end-of-run summary; per-source details are not shown."""
        fetched = stats.get("verified", 0) + stats.get("downloaded", 0)
        skipped = stats.get("skipped", 0)
        failed = stats.get("failed", 0)
        print("litsync run complete")
        print(f"  started:  {started}")
        print(f"  finished: {finished}")
        print(f"  newly downloaded: {fetched}, skipped: {skipped}, failed: {failed}")
        print(f"  bytes: {human_bytes(bytes_downloaded)}, articles: {articles_downloaded:,}")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class _NoopProgress:
    """Progress stand-in that remembers the total and ignores every update."""

    def __init__(self, total: int):
        self.total = total

    def advance(self, n: int = 1):
        """Accept and discard a progress step."""
        return None

    def set_description(self, desc: str):
        """Accept and discard a new description."""
        return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class _NoopDownloadProgress:
    """Download-progress stand-in: every task gets id 0 and updates are ignored."""

    def add_task(self, description: str, total: Optional[int]) -> int:
        """Pretend to register a task; the returned id is always 0."""
        placeholder_id = 0
        return placeholder_id

    def update(self, task_id: int, advance: int = 0, completed: Optional[int] = None,
               description: Optional[str] = None):
        """Accept and discard a progress update."""
        return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class RichUI(UI):
    """Rich-based progress bars and tables."""

    def planning(self, name: str):
        """Announce that planner *name* is being run."""
        console.print(f"[cyan]Planning[/cyan] {name} ...")

    def planned(self, total: int, sources: int):
        """Report how many files were planned across how many source groups."""
        console.print(f"[green]Planned {total} files across {sources} source groups[/green]")

    @contextlib.contextmanager
    def metadata_progress(self, total: int):
        """Show a transient bar for the per-file pass; yields a ``_RichProgress``."""
        progress = Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TextColumn("({task.completed}/{task.total})"),
            console=console,
            transient=True,
        )
        task = progress.add_task("Checking metadata...", total=total)
        with progress:
            yield _RichProgress(task, progress)

    @contextlib.contextmanager
    def download_progress(self):
        """Show per-download bars; yields a ``_RichDownloadProgress``."""
        progress = Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(bar_width=40),
            "[progress.percentage]{task.percentage:>3.0f}%",
            " ",
            DownloadColumn(),
            " ",
            TransferSpeedColumn(),
            " ",
            TimeRemainingColumn(),
            console=console,
        )
        with progress:
            yield _RichDownloadProgress(progress)

    def extract(self, rel_path: str):
        """Announce that *rel_path* is being extracted."""
        console.print(f"[yellow]Extracting[/yellow] {rel_path}")

    def summary(self, started: str, finished: str, stats: dict, per_source: dict,
                mirror: dict, source_urls: dict, bytes_downloaded: int, articles_downloaded: int):
        """Render the end-of-run table: one row per source, family subtotals, total.

        *mirror* holds per-source mirror totals (``files``, ``bytes``,
        ``articles``, ``counted``); *per_source* holds this run's counters.
        Sources are grouped into families by the prefix before the first ``_``.
        """
        # partial_note() yields text like " [partial: 1/2 files counted]";
        # unescaped, Rich would parse the bracketed part as a style tag and
        # drop it from the rendered cell.
        from rich.markup import escape

        new_dl = stats.get("verified", 0) + stats.get("downloaded", 0)
        table = Table(title="litsync summary", show_header=True, header_style="bold magenta")
        table.add_column("Source", style="cyan")
        table.add_column("Files", justify="right")
        table.add_column("Size", justify="right")
        table.add_column("Records", justify="right")
        table.add_column("Run", justify="left")

        labels = {
            "pubmed": "PubMed",
            "pmc": "PubMed Central",
            "fda": "openFDA",
            "clinicaltrials": "ClinicalTrials.gov",
        }

        families: dict[str, list[str]] = {}
        sources = sorted(set(mirror) | set(per_source))
        for source in sources:
            families.setdefault(source.split("_", 1)[0], []).append(source)

        grand_files = grand_bytes = grand_articles = grand_counted = 0
        for fam in sorted(families):
            fam_files = fam_bytes = fam_articles = fam_counted = 0
            for source in families[fam]:
                m = mirror.get(source, {"files": 0, "bytes": 0, "articles": 0, "counted": 0})
                r = per_source.get(source, new_src_stats())
                got = r.get("verified", 0) + r.get("downloaded", 0)
                if m["counted"]:
                    art = f"{m['articles']:,}"
                    if m["counted"] < m["files"]:
                        art += f" (counted {m['counted']}/{m['files']})"
                else:
                    # No file of this source has a record count yet.
                    art = "—"
                run_txt = f"+{r.get('new', 0)} new, {got} fetched, {r.get('skipped', 0)} current, {r.get('failed', 0)} failed"
                table.add_row(source, str(m["files"]), human_bytes(m["bytes"]), art, run_txt)
                fam_files += m["files"]
                fam_bytes += m["bytes"]
                fam_articles += m["articles"]
                fam_counted += m["counted"]
            table.add_row(
                f"[bold]{labels.get(fam, fam)} subtotal[/bold]",
                str(fam_files),
                human_bytes(fam_bytes),
                f"{fam_articles:,}{escape(partial_note(fam_counted, fam_files))}",
                "",
            )
            grand_files += fam_files
            grand_bytes += fam_bytes
            grand_articles += fam_articles
            grand_counted += fam_counted

        table.add_row(
            "[bold]TOTAL[/bold]",
            str(grand_files),
            human_bytes(grand_bytes),
            f"{grand_articles:,}{escape(partial_note(grand_counted, grand_files))}",
            f"+{new_dl} downloaded, {stats.get('skipped', 0)} skipped, {stats.get('failed', 0)} failed",
            style="bold green",
        )
        console.print()
        console.print(table)
        console.print(
            f"[dim]started: {started}  ·  finished: {finished}  ·  "
            f" bytes this run: {human_bytes(bytes_downloaded)}  ·  "
            f" records this run: {articles_downloaded:,}[/dim]"
        )
        if grand_counted < grand_files:
            console.print(
                "[dim]Run --count-articles to backfill record counts for already-downloaded files.[/dim]"
            )
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class _RichProgress:
    """Adapter binding one task id to the Progress object that owns it."""

    def __init__(self, task: TaskID, progress: Progress):
        self.task, self.progress = task, progress

    def advance(self, n: int = 1):
        """Move the bound task forward by *n* steps."""
        bar = self.progress
        bar.advance(self.task, n)

    def set_description(self, desc: str):
        """Replace the bound task's description text."""
        bar = self.progress
        bar.update(self.task, description=desc)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class _RichDownloadProgress:
    """Adapter exposing ``add_task``/``update`` on top of a Progress object."""

    def __init__(self, progress: Progress):
        self.progress = progress

    def add_task(self, description: str, total: Optional[int]) -> int:
        """Register a new task and return the id the Progress object assigns."""
        new_id = self.progress.add_task(description, total=total)
        return new_id

    def update(self, task_id: int, advance: int = 0, completed: Optional[int] = None,
               description: Optional[str] = None):
        """Forward an update; ``completed``/``description`` are sent only if given."""
        fields = {"advance": advance}
        optional = (("completed", completed), ("description", description))
        for key, value in optional:
            if value is not None:
                fields[key] = value
        self.progress.update(task_id, **fields)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def new_src_stats() -> dict:
    """Return a fresh per-source counter dict with every counter at zero."""
    counters = ("new", "existing", "skipped",
                "downloaded", "verified", "failed", "bytes", "articles")
    return dict.fromkeys(counters, 0)
|
litsync/utils.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import datetime as dt
|
|
5
|
+
import fcntl
|
|
6
|
+
import gzip
|
|
7
|
+
import hashlib
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import tarfile
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
LOG = logging.getLogger("litsync")
|
|
16
|
+
CHUNK = 1 << 20 # 1 MiB
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def utcnow() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    moment = dt.datetime.now(tz=dt.timezone.utc)
    # Dropping microseconds makes isoformat() omit the fractional part.
    return moment.replace(microsecond=0).isoformat()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def new_src_stats() -> dict:
    """Return a fresh per-source counter dict with every counter at zero."""
    counters = ("new", "existing", "skipped",
                "downloaded", "verified", "failed", "bytes", "articles")
    return {name: 0 for name in counters}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def human_bytes(n) -> str:
    """Format a byte count using 1024-based units from B up to PB.

    Falsy input (``None``, ``0``) is shown as ``0 B``. Plain bytes are printed
    as an integer; larger units carry one decimal place.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(n or 0)
    step = 0
    # Scale down until the value fits below 1024 or the unit list runs out.
    while step < len(units) and not value < 1024:
        value /= 1024
        step += 1
    if step == len(units):
        return f"{value:.1f} PB"
    if step == 0:
        return f"{int(value)} B"
    return f"{value:.1f} {units[step]}"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def partial_note(counted: int, files: int) -> str:
    """Return a " [partial: …]" suffix when fewer files were counted than exist."""
    if counted < files:
        return f"  [partial: {counted}/{files} files counted]"
    return ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def md5_file(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in CHUNK-sized pieces so it never has to fit in memory.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(CHUNK)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def parse_md5(text: str) -> str:
    """Pull the 32-digit hex MD5 out of a checksum sidecar's text.

    Two layouts are recognised, tried in order: ``MD5(file.xml.gz)= <hex>``
    and ``<hex> file``. Raises ``ValueError`` if no digest is found.
    """
    patterns = (r"=\s*([0-9a-fA-F]{32})", r"\b([0-9a-fA-F]{32})\b")
    for pattern in patterns:
        hit = re.search(pattern, text)
        if hit is not None:
            return hit.group(1)
    raise ValueError(f"could not parse md5 from: {text[:120]!r}")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@contextlib.contextmanager
def run_lock(path: Path):
    """Prevent overlapping runs via an exclusive, non-blocking file lock.

    While the lock is held the file contains ``"<pid> <utc timestamp>"`` of
    the holder. Raises ``SystemExit`` if another run already holds the lock.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # "a+" creates the file when missing but, unlike "w", does not truncate on
    # open: a process that loses the race must not erase the holder's record.
    fh = open(path, "a+")
    try:
        try:
            fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            raise SystemExit("another litsync run is already in progress; exiting")
        # We own the lock now, so replacing the previous holder's record is safe.
        fh.truncate(0)
        fh.write(f"{os.getpid()} {utcnow()}\n")
        fh.flush()
        yield
    finally:
        with contextlib.suppress(Exception):
            fcntl.flock(fh, fcntl.LOCK_UN)
        fh.close()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def count_articles(path: Path, source: str) -> int:
    """Cheaply count the articles/records inside one downloaded file.

    The counting strategy is chosen by the prefix of *source*:

    * ``pubmed*``: occurrences of ``<PubmedArticle>`` in a ``.xml.gz`` file;
    * ``pmc*``: ``.xml``/``.nxml`` members of a ``.tar.gz`` archive;
    * ``fda*``: summed ``results`` lengths of the JSON files in the directory
      the ``.zip`` was extracted to;
    * ``clinicaltrials*``: ``.xml`` files in the extracted directory.

    Returns 0 for any other source or an unexpected file extension.
    """
    name = path.name.lower()
    if source.startswith("pubmed"):
        if not name.endswith(".xml.gz"):
            return 0
        needle = b"<PubmedArticle>"
        overlap = b""
        count = 0
        with gzip.open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(CHUNK), b""):
                buf = overlap + chunk
                count += buf.count(needle)
                # Carry the tail so a tag split across two reads is still
                # found; it is one byte too short to hold a whole tag, so
                # nothing is counted twice.
                overlap = buf[-(len(needle) - 1):]
        return count
    if source.startswith("pmc"):
        if not name.endswith(".tar.gz"):
            return 0
        count = 0
        # "r|gz" reads the archive as a forward-only stream.
        with tarfile.open(path, mode="r|gz") as tar:
            for member in tar:
                low = member.name.lower()
                if member.isfile() and low.endswith((".xml", ".nxml")):
                    count += 1
        return count
    if source.startswith("fda"):
        if not name.endswith(".zip"):
            return 0
        import json
        # Build the directory from the real (not lower-cased) file name: the
        # extraction step names the directory that way, and the two must agree
        # on case-sensitive filesystems.
        real_name = path.name
        if real_name.endswith(".json.zip"):
            extract_dir = path.parent / real_name[:-9]  # drop ".json.zip"
        else:
            extract_dir = path.with_suffix("")
        count = 0
        for json_file in extract_dir.rglob("*.json"):
            try:
                data = json.loads(json_file.read_text(encoding="utf-8"))
                count += len(data.get("results", []))
            except Exception as exc:
                # Best effort: one unreadable file must not void the count.
                LOG.debug("skipping %s while counting: %s", json_file, exc)
        return count
    if source.startswith("clinicaltrials"):
        if not name.endswith(".zip"):
            return 0
        extract_dir = path.with_suffix("")
        return sum(1 for p in extract_dir.rglob("*.xml") if p.is_file())
    return 0
|