litsync 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
litsync/sync.py ADDED
@@ -0,0 +1,334 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import datetime as dt
5
+ import json
6
+ import logging
7
+ import shutil
8
+ import threading
9
+ import zipfile
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+ from litsync.config import Config
15
+ from litsync.http import HttpClient
16
+ from litsync.sources import Task
17
+ from litsync.sources.clinicaltrials import ClinicalTrialsSource
18
+ from litsync.sources.fda import FdaSource
19
+ from litsync.sources.pmc import PmcSource
20
+ from litsync.sources.pubmed import PubMedSource
21
+ from litsync.state import FileRecord, StateDB
22
+ from litsync.ui import UI
23
+ from litsync.utils import count_articles, human_bytes, md5_file, new_src_stats, utcnow
24
+
25
+ LOG = logging.getLogger("litsync")
26
+
27
+
28
class Syncer:
    """Drive one mirror run: plan tasks, fetch/verify files, prune and report.

    Work items are processed by a thread pool, so every counter on the
    instance is only mutated while holding ``_stats_lock``.
    """

    def __init__(self, cfg: Config, ui: UI):
        """Wire up the HTTP client, the state database and zeroed run counters."""
        self.cfg = cfg
        self.ui = ui
        self.http = HttpClient(cfg)
        self.db = StateDB(cfg.db_path)
        self._stats_lock = threading.Lock()
        self.stats = {"skipped": 0, "downloaded": 0, "verified": 0, "failed": 0}
        self.bytes_downloaded = 0
        self.articles_downloaded = 0
        self.per_source: dict[str, dict] = {}
        self.source_urls: dict[str, str] = {}
        self.started_at = utcnow()
        # Reset by backfill_counts(); defined here so the attribute always exists.
        self.failed = 0

    # ---- per-file decision logic ---------------------------------------- #

    def _needs_work(self, task: Task) -> tuple[bool, Optional[int]]:
        """Decide whether *task* has to be (re)downloaded.

        Returns ``(needs_download, expected_size)`` where ``expected_size``
        is the recorded or freshly reported remote size (may be ``None``).

        An immutable file that is already ``verified`` and present on disk is
        trusted without contacting the server, as long as its size matches
        the recorded one. Otherwise a HEAD request is made and the local copy
        is considered current when both size and ETag agree.
        """
        row = self.db.get(task.source, task.filename)
        on_disk = task.dest.exists()
        reusable = row is not None and on_disk and not self.cfg.reverify

        if reusable and row["status"] == "verified" and task.immutable:
            recorded = row["remote_size"]
            if recorded is None or task.dest.stat().st_size == recorded:
                return False, recorded

        size, mtime, etag = self.http.head(task.url)
        # A dry run never downloads, so it must not store the new ETag/size:
        # otherwise the next real run would compare the stale local file
        # against metadata that already matches the server and skip it.
        if not self.cfg.dry_run:
            self.db.mark(task.source, task.filename,
                         remote_size=size, remote_mtime=mtime, etag=etag)

        if reusable and row["status"] in ("done", "verified"):
            # `row` was read before the mark() above, so it still holds the
            # previously stored ETag.
            same_size = size is None or task.dest.stat().st_size == size
            same_etag = etag is None or row["etag"] == etag
            if same_size and same_etag:
                return False, size

        return True, size

    # ---- worker --------------------------------------------------------- #

    def _process(self, task: Task, download_progress) -> None:
        """Handle one task end to end: decide, download, verify, extract, count.

        Failures of the metadata check or of the download/verify/extract
        steps are recorded in the database and in the run statistics instead
        of being raised.
        """
        is_new = self.db.get(task.source, task.filename) is None
        self.db.upsert_seen(FileRecord(task.source, task.filename, task.url, task.rel_path))
        self._record_seen(task.source, is_new)
        try:
            needs, expected_size = self._needs_work(task)
        except Exception as exc:
            self.db.mark(task.source, task.filename, status="failed", error=str(exc))
            self._bump("failed", task.source)
            LOG.error("metadata failed for %s: %s", task.filename, exc)
            return

        if not needs:
            self._bump("skipped", task.source)
            return

        if self.cfg.dry_run:
            # Count what would have been fetched, without touching the network.
            self._bump("downloaded", task.source, expected_size or 0)
            return

        expected_md5 = None
        if task.md5_url:
            # Best effort: a missing or unparsable sidecar must not block the
            # download, but it should be visible in the log.
            try:
                expected_md5 = md5_file_from_url(self.http.get_text(task.md5_url))
                self.db.mark(task.source, task.filename, md5=expected_md5)
            except Exception as exc:
                LOG.warning("md5 sidecar unavailable for %s: %s", task.filename, exc)

        task_id = download_progress.add_task(task.filename, expected_size)

        def progress_cb(nbytes: int):
            download_progress.update(task_id, advance=nbytes)

        try:
            attempts = (self.db.get(task.source, task.filename)["attempts"] or 0) + 1
            self.db.mark(task.source, task.filename, status="pending", attempts=attempts)
            written = self.http.download(task.url, task.dest, expected_size, progress_cb)
            download_progress.update(task_id, completed=written)

            if expected_md5:
                actual = md5_file(task.dest)
                if actual.lower() != expected_md5.lower():
                    # Never keep a file that failed its checksum.
                    task.dest.unlink(missing_ok=True)
                    raise IOError(f"md5 mismatch: got {actual}, expected {expected_md5}")
                self.db.mark(task.source, task.filename,
                             status="verified", local_md5=actual, error=None)
            else:
                # NOTE(review): files without a checksum are also recorded as
                # "verified"; _needs_work() accepts a "done" status too, which
                # nothing visible here sets — confirm which one is intended.
                self.db.mark(task.source, task.filename, status="verified", error=None)
            self._bump("verified", task.source, written)

            if task.extract:
                self._extract_zip(task)
            self._count_and_store(task)
            LOG.info("ok %s", task.rel_path)
        except Exception as exc:
            self.db.mark(task.source, task.filename, status="failed", error=str(exc))
            self._bump("failed", task.source)
            LOG.error("download failed for %s: %s", task.filename, exc)

    def _bump(self, key: str, source: Optional[str] = None, nbytes: int = 0) -> None:
        """Increment the *key* counter (globally and for *source*) and add *nbytes*."""
        with self._stats_lock:
            self.stats[key] += 1
            self.bytes_downloaded += nbytes
            if source is not None:
                s = self.per_source.setdefault(source, new_src_stats())
                s[key] += 1
                s["bytes"] += nbytes

    def _record_seen(self, source: str, is_new: bool) -> None:
        """Count a planned file as ``new`` or ``existing`` for *source*."""
        with self._stats_lock:
            s = self.per_source.setdefault(source, new_src_stats())
            s["new" if is_new else "existing"] += 1

    def _extract_zip(self, task: Task) -> None:
        """Unpack the downloaded archive next to itself.

        ``foo.json.zip`` is extracted into ``foo/``; any other name into the
        path with its last suffix removed. An existing extraction is kept
        when its newest file is at least as recent as the archive, otherwise
        it is removed and redone.
        """
        if not task.dest.exists():
            return
        name = task.dest.name
        if name.endswith(".json.zip"):
            extract_dir = task.dest.parent / name[:-len(".json.zip")]
        else:
            extract_dir = task.dest.with_suffix("")
        if extract_dir.exists():
            zip_mtime = task.dest.stat().st_mtime
            dir_mtime = max(
                (p.stat().st_mtime for p in extract_dir.rglob("*") if p.is_file()),
                default=0,
            )
            if dir_mtime >= zip_mtime:
                return
            shutil.rmtree(extract_dir, ignore_errors=True)
        self.ui.extract(task.rel_path)
        extract_dir.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(task.dest, "r") as zf:
            zf.extractall(extract_dir)

    def _count_and_store(self, task: Task) -> None:
        """Count the records in the downloaded file and persist the number.

        A counting failure is logged as a warning and otherwise ignored.
        """
        if self.cfg.dry_run:
            return
        try:
            n = count_articles(task.dest, task.source)
        except Exception as exc:
            LOG.warning("article count failed for %s: %s", task.filename, exc)
            return
        self.db.mark(task.source, task.filename, article_count=n)
        with self._stats_lock:
            self.per_source.setdefault(task.source, new_src_stats())["articles"] += n
            self.articles_downloaded += n

    # ---- prune ---------------------------------------------------------- #

    def _prune(self, tasks_by_source: dict[str, list[Task]]) -> None:
        """Delete local files that the database knows but the remote no longer lists.

        Only sources whose family (the part before the first ``_``) is among
        the configured sources are considered. In dry-run mode the candidates
        are logged but nothing is deleted.

        NOTE(review): a source with no planned tasks in this run is treated
        as having an empty remote listing, so all of its known files become
        prune candidates — confirm planners cannot return an empty plan on a
        transient listing failure.
        """
        remote_by_source = {src: {t.filename for t in tasks}
                            for src, tasks in tasks_by_source.items()}
        synced_families = set(self.cfg.sources)

        for source in self.db.all_sources():
            if source.split("_", 1)[0] not in synced_families:
                continue
            remote = remote_by_source.get(source, set())
            for fname in self.db.known_filenames(source) - remote:
                row = self.db.get(source, fname)
                if not (row and row["rel_path"]):
                    continue
                p = self.cfg.data_root / row["rel_path"]
                if not p.exists():
                    continue
                LOG.info("prune (no longer remote): %s", row["rel_path"])
                if not self.cfg.dry_run:
                    p.unlink(missing_ok=True)

    # ---- run ------------------------------------------------------------ #

    def run(self) -> int:
        """Plan all configured sources, process every task, then prune and report.

        Returns the process exit code: ``1`` when at least one file failed,
        ``0`` otherwise. The state database is closed on every exit path.
        """
        try:
            available = (
                ("pubmed", PubMedSource),
                ("pmc", PmcSource),
                ("fda", FdaSource),
                ("clinicaltrials", ClinicalTrialsSource),
            )
            planners = [factory(self.cfg, self.http)
                        for key, factory in available if key in self.cfg.sources]

            all_tasks: list[Task] = []
            by_source: dict[str, list[Task]] = {}
            for p in planners:
                self.ui.planning(type(p).__name__)
                tasks = p.plan()
                all_tasks.extend(tasks)
                for t in tasks:
                    by_source.setdefault(t.source, []).append(t)
                    self.source_urls.setdefault(t.source, t.url.rsplit("/", 1)[0] + "/")
            self.ui.planned(len(all_tasks), len(by_source))

            with self.ui.metadata_progress(len(all_tasks)) as meta, \
                    self.ui.download_progress() as dl:
                def work(task: Task):
                    try:
                        self._process(task, dl)
                    finally:
                        # Keep the bar truthful even when a task blows up.
                        meta.advance(1)

                with ThreadPoolExecutor(max_workers=self.cfg.workers) as pool:
                    futures = {pool.submit(work, t): t for t in all_tasks}
                    for fut in as_completed(futures):
                        # _process() handles its expected failures itself; an
                        # exception here escaped that handling and would
                        # otherwise vanish with the future.
                        exc = fut.exception()
                        if exc is not None:
                            crashed = futures[fut]
                            self._bump("failed", crashed.source)
                            LOG.error("worker crashed for %s: %s", crashed.filename, exc)

            if self.cfg.prune:
                self._prune(by_source)

            self._report()
            return 1 if self.stats["failed"] else 0
        finally:
            self.db.close()

    # ---- article-count backfill (no network) --------------------------- #

    def backfill_counts(self) -> int:
        """Count records for files the database has no article count for.

        Files that are missing on disk are skipped. Returns ``1`` when any
        count failed, ``0`` otherwise. The state database is closed on every
        exit path.
        """
        try:
            rows = self.db.files_missing_counts()
            self.ui.planned(len(rows), 0)
            self.failed = 0

            def work(item: tuple[str, str, str]) -> None:
                source, filename, rel_path = item
                path = self.cfg.data_root / rel_path
                if not path.exists():
                    return
                try:
                    n = count_articles(path, source)
                except Exception as exc:
                    with self._stats_lock:
                        self.failed += 1
                    LOG.error("count failed for %s: %s", rel_path, exc)
                    return
                self.db.mark(source, filename, article_count=n)
                with self._stats_lock:
                    self.per_source.setdefault(source, new_src_stats())["articles"] += n
                    self.articles_downloaded += n

            with ThreadPoolExecutor(max_workers=self.cfg.workers) as pool:
                futures = {pool.submit(work, it): it for it in rows}
                for fut in as_completed(futures):
                    # Surface anything work() did not handle itself (for
                    # example a failing database write).
                    exc = fut.exception()
                    if exc is not None:
                        with self._stats_lock:
                            self.failed += 1
                        LOG.error("count failed for %s: %s", futures[fut][2], exc)

            self.stats["failed"] = self.failed
            self._report()
            return 1 if self.failed else 0
        finally:
            self.db.close()

    # ---- post-sync summary --------------------------------------------- #

    def _report(self) -> None:
        """Write the JSON manifest and show the end-of-run summary in the UI."""
        finished = utcnow()
        mirror = self.db.summary_by_source()
        self.write_json_manifest(finished, mirror)
        self.ui.summary(
            self.started_at, finished, self.stats, self.per_source, mirror,
            self.source_urls, self.bytes_downloaded, self.articles_downloaded,
        )

    def write_json_manifest(self, finished: str, mirror: dict) -> None:
        """Dump a machine-readable run summary to ``summary_<date>.json`` in the log dir.

        *mirror* maps each source to its totals (``files``, ``bytes``,
        ``articles``, ``counted``); missing keys count as zero. An
        ``OSError`` while writing is logged as a warning, not raised.
        """
        per_source_section = {}
        for source in sorted(set(mirror) | set(self.per_source)):
            totals = mirror.get(source, {})
            per_source_section[source] = {
                "url": self.source_urls.get(source),
                "mirror_files": totals.get("files", 0),
                "mirror_bytes": totals.get("bytes", 0),
                "mirror_articles": totals.get("articles", 0),
                "files_counted": totals.get("counted", 0),
                **self.per_source.get(source, new_src_stats()),
            }

        manifest = {
            "started_at": self.started_at,
            "finished_at": finished,
            "this_run": {
                "newly_downloaded": self.stats["verified"] + self.stats["downloaded"],
                "already_current": self.stats["skipped"],
                "failed": self.stats["failed"],
                "bytes_downloaded": self.bytes_downloaded,
                "articles_added": self.articles_downloaded,
            },
            "sources": per_source_section,
            "mirror_total": {
                "files": sum(m.get("files", 0) for m in mirror.values()),
                "bytes": sum(m.get("bytes", 0) for m in mirror.values()),
                "articles": sum(m.get("articles", 0) for m in mirror.values()),
            },
        }
        out = self.cfg.log_dir / f"summary_{dt.date.today().isoformat()}.json"
        try:
            out.parent.mkdir(parents=True, exist_ok=True)
            # Explicit encoding so the manifest does not depend on the locale.
            out.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
            LOG.info("wrote summary manifest: %s", out)
        except OSError as exc:
            LOG.warning("could not write summary manifest: %s", exc)
324
+
325
+
326
def md5_file_from_url(text: str) -> str:
    """Extract a 32-digit hex MD5 from the body of a checksum sidecar.

    A digest that follows an ``=`` sign is preferred; otherwise the first
    stand-alone 32-hex-digit word is used. Raises ``ValueError`` when the
    text contains neither.
    """
    import re

    patterns = (
        r"=\s*([0-9a-fA-F]{32})",
        r"\b([0-9a-fA-F]{32})\b",
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1)
    raise ValueError(f"could not parse md5 from: {text[:120]!r}")
litsync/ui.py ADDED
@@ -0,0 +1,232 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ from typing import Optional
5
+
6
+ from rich.console import Console
7
+ from rich.panel import Panel
8
+ from rich.progress import (
9
+ BarColumn,
10
+ DownloadColumn,
11
+ Progress,
12
+ SpinnerColumn,
13
+ TaskID,
14
+ TextColumn,
15
+ TimeRemainingColumn,
16
+ TransferSpeedColumn,
17
+ )
18
+ from rich.table import Table
19
+ from rich.text import Text
20
+
21
+ from litsync.utils import human_bytes, partial_note
22
+
23
+ console = Console()
24
+
25
+
26
class UI:
    """Plain-text fallback UI (no colors, no progress bars)."""

    def planning(self, name: str):
        """Announce that the planner called *name* is starting."""
        print(f"Planning {name} ...")

    def planned(self, total: int, sources: int):
        """Report how many files were planned across how many source groups."""
        print(f"Planned {total} files across {sources} source groups")

    @contextlib.contextmanager
    def metadata_progress(self, total: int):
        """Yield a progress handle whose methods do nothing."""
        handle = _NoopProgress(total)
        yield handle

    @contextlib.contextmanager
    def download_progress(self):
        """Yield a download-progress handle whose methods do nothing."""
        handle = _NoopDownloadProgress()
        yield handle

    def extract(self, rel_path: str):
        """Announce that the archive at *rel_path* is being extracted."""
        print(f"Extracting {rel_path}")

    def summary(self, started: str, finished: str, stats: dict, per_source: dict,
                mirror: dict, source_urls: dict, bytes_downloaded: int, articles_downloaded: int):
        """Print the end-of-run totals; the per-source arguments are not used here."""
        fetched = stats.get("verified", 0) + stats.get("downloaded", 0)
        skipped = stats.get("skipped", 0)
        failed = stats.get("failed", 0)
        report = (
            "litsync run complete",
            f" started: {started}",
            f" finished: {finished}",
            f" newly downloaded: {fetched}, skipped: {skipped}, failed: {failed}",
            f" bytes: {human_bytes(bytes_downloaded)}, articles: {articles_downloaded:,}",
        )
        for line in report:
            print(line)
54
+
55
+
56
+ class _NoopProgress:
57
+ def __init__(self, total: int):
58
+ self.total = total
59
+
60
+ def advance(self, n: int = 1):
61
+ pass
62
+
63
+ def set_description(self, desc: str):
64
+ pass
65
+
66
+
67
+ class _NoopDownloadProgress:
68
+ def add_task(self, description: str, total: Optional[int]) -> int:
69
+ return 0
70
+
71
+ def update(self, task_id: int, advance: int = 0, completed: Optional[int] = None,
72
+ description: Optional[str] = None):
73
+ pass
74
+
75
+
76
class RichUI(UI):
    """Rich-based progress bars and tables."""

    def planning(self, name: str):
        """Announce that the planner called *name* is starting."""
        console.print(f"[cyan]Planning[/cyan] {name} ...")

    def planned(self, total: int, sources: int):
        """Report how many files were planned across how many source groups."""
        console.print(f"[green]Planned {total} files across {sources} source groups[/green]")

    @contextlib.contextmanager
    def metadata_progress(self, total: int):
        """Show a transient bar for the metadata phase and yield its handle."""
        bar = Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TextColumn("({task.completed}/{task.total})"),
            console=console,
            transient=True,
        )
        task_id = bar.add_task("Checking metadata...", total=total)
        with bar:
            yield _RichProgress(task_id, bar)

    @contextlib.contextmanager
    def download_progress(self):
        """Show the per-file download bars and yield a handle for adding tasks."""
        bars = Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(bar_width=40),
            "[progress.percentage]{task.percentage:>3.0f}%",
            " ",
            DownloadColumn(),
            " ",
            TransferSpeedColumn(),
            " ",
            TimeRemainingColumn(),
            console=console,
        )
        with bars:
            yield _RichDownloadProgress(bars)

    def extract(self, rel_path: str):
        """Announce that the archive at *rel_path* is being extracted."""
        console.print(f"[yellow]Extracting[/yellow] {rel_path}")

    def summary(self, started: str, finished: str, stats: dict, per_source: dict,
                mirror: dict, source_urls: dict, bytes_downloaded: int, articles_downloaded: int):
        """Render the end-of-run table: one row per source, a subtotal per family, a total.

        A source's family is the part of its name before the first ``_``.
        *source_urls* is accepted but not used by this renderer.
        """
        fetched_total = stats.get("verified", 0) + stats.get("downloaded", 0)

        table = Table(title="litsync summary", show_header=True, header_style="bold magenta")
        table.add_column("Source", style="cyan")
        for heading in ("Files", "Size", "Records"):
            table.add_column(heading, justify="right")
        table.add_column("Run", justify="left")

        labels = {
            "pubmed": "PubMed",
            "pmc": "PubMed Central",
            "fda": "openFDA",
            "clinicaltrials": "ClinicalTrials.gov",
        }

        # Group the sorted source names by family, keeping their order.
        families: dict[str, list[str]] = {}
        for source in sorted(set(mirror) | set(per_source)):
            families.setdefault(source.split("_", 1)[0], []).append(source)

        grand_files = grand_bytes = grand_articles = grand_counted = 0
        for fam in sorted(families):
            fam_files = fam_bytes = fam_articles = fam_counted = 0
            for source in families[fam]:
                m = mirror.get(source, {"files": 0, "bytes": 0, "articles": 0, "counted": 0})
                r = per_source.get(source, new_src_stats())
                got = r.get("verified", 0) + r.get("downloaded", 0)

                if not m["counted"]:
                    art = "—"
                else:
                    art = f"{m['articles']:,}"
                    if m["counted"] < m["files"]:
                        art += f" (counted {m['counted']}/{m['files']})"

                run_txt = f"+{r.get('new', 0)} new, {got} fetched, {r.get('skipped', 0)} current, {r.get('failed', 0)} failed"
                table.add_row(source, str(m["files"]), human_bytes(m["bytes"]), art, run_txt)

                fam_files += m["files"]
                fam_bytes += m["bytes"]
                fam_articles += m["articles"]
                fam_counted += m["counted"]

            table.add_row(
                f"[bold]{labels.get(fam, fam)} subtotal[/bold]",
                str(fam_files),
                human_bytes(fam_bytes),
                f"{fam_articles:,}{partial_note(fam_counted, fam_files)}",
                "",
            )
            grand_files += fam_files
            grand_bytes += fam_bytes
            grand_articles += fam_articles
            grand_counted += fam_counted

        table.add_row(
            "[bold]TOTAL[/bold]",
            str(grand_files),
            human_bytes(grand_bytes),
            f"{grand_articles:,}{partial_note(grand_counted, grand_files)}",
            f"+{fetched_total} downloaded, {stats.get('skipped', 0)} skipped, {stats.get('failed', 0)} failed",
            style="bold green",
        )
        console.print()
        console.print(table)
        console.print(
            f"[dim]started: {started} · finished: {finished} · "
            f" bytes this run: {human_bytes(bytes_downloaded)} · "
            f" records this run: {articles_downloaded:,}[/dim]"
        )
        if grand_counted < grand_files:
            console.print(
                "[dim]Run --count-articles to backfill record counts for already-downloaded files.[/dim]"
            )
199
+
200
+
201
+ class _RichProgress:
202
+ def __init__(self, task: TaskID, progress: Progress):
203
+ self.task = task
204
+ self.progress = progress
205
+
206
+ def advance(self, n: int = 1):
207
+ self.progress.advance(self.task, n)
208
+
209
+ def set_description(self, desc: str):
210
+ self.progress.update(self.task, description=desc)
211
+
212
+
213
+ class _RichDownloadProgress:
214
+ def __init__(self, progress: Progress):
215
+ self.progress = progress
216
+
217
+ def add_task(self, description: str, total: Optional[int]) -> int:
218
+ return self.progress.add_task(description, total=total)
219
+
220
+ def update(self, task_id: int, advance: int = 0, completed: Optional[int] = None,
221
+ description: Optional[str] = None):
222
+ kwargs = {"advance": advance}
223
+ if completed is not None:
224
+ kwargs["completed"] = completed
225
+ if description is not None:
226
+ kwargs["description"] = description
227
+ self.progress.update(task_id, **kwargs)
228
+
229
+
230
def new_src_stats() -> dict:
    """Return a fresh per-source statistics dict with every counter at zero."""
    counters = (
        "new", "existing", "skipped",
        "downloaded", "verified", "failed", "bytes", "articles",
    )
    return dict.fromkeys(counters, 0)
litsync/utils.py ADDED
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import datetime as dt
5
+ import fcntl
6
+ import gzip
7
+ import hashlib
8
+ import logging
9
+ import os
10
+ import re
11
+ import tarfile
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ LOG = logging.getLogger("litsync")
16
+ CHUNK = 1 << 20 # 1 MiB
17
+
18
+
19
def utcnow() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    now = dt.datetime.now(tz=dt.timezone.utc)
    return now.isoformat(timespec="seconds")
21
+
22
+
23
def new_src_stats() -> dict:
    """Return a fresh per-source statistics dict with every counter at zero."""
    counters = (
        "new", "existing", "skipped",
        "downloaded", "verified", "failed", "bytes", "articles",
    )
    return dict.fromkeys(counters, 0)
26
+
27
+
28
def human_bytes(n) -> str:
    """Format a byte count with 1024-based units; falsy input counts as zero.

    Plain bytes are shown as an integer, larger units with one decimal.
    """
    value = float(n or 0)
    if value < 1024:
        return f"{int(value)} B"
    for unit in ("KB", "MB", "GB", "TB"):
        value /= 1024
        if value < 1024:
            return f"{value:.1f} {unit}"
    # Anything past terabytes is reported in petabytes.
    value /= 1024
    return f"{value:.1f} PB"
35
+
36
+
37
def partial_note(counted: int, files: int) -> str:
    """Return a suffix flagging an incomplete count, or ``""`` when all files are counted."""
    if counted >= files:
        return ""
    return f" [partial: {counted}/{files} files counted]"
39
+
40
+
41
def md5_file(path: Path) -> str:
    """Return the hex MD5 digest of the file at *path*, read in ``CHUNK``-sized pieces."""
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(CHUNK)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
47
+
48
+
49
def parse_md5(text: str) -> str:
    """NCBI .md5 sidecars look like 'MD5(file.xml.gz)= <hex>' or '<hex> file'.

    The digest after an ``=`` sign wins; otherwise the first stand-alone
    32-hex-digit word is returned. Raises ``ValueError`` when neither form
    is present.
    """
    patterns = (
        r"=\s*([0-9a-fA-F]{32})",
        r"\b([0-9a-fA-F]{32})\b",
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1)
    raise ValueError(f"could not parse md5 from: {text[:120]!r}")
58
+
59
+
60
@contextlib.contextmanager
def run_lock(path: Path):
    """Prevent overlapping runs via an exclusive file lock.

    Creates the parent directory and the lock file if needed, takes a
    non-blocking exclusive ``flock`` on it and writes ``"<pid> <utc time>"``
    into it for the duration of the ``with`` block.

    Raises:
        SystemExit: when another holder already owns the lock.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Open without truncating: with mode "w" a process that then fails to get
    # the lock would already have wiped the PID line written by the holder.
    fh = open(path, "a+", encoding="utf-8")
    try:
        try:
            fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            raise SystemExit("another litsync run is already in progress; exiting") from None
        # The lock is ours now, so it is safe to replace the previous contents.
        fh.seek(0)
        fh.truncate()
        fh.write(f"{os.getpid()} {utcnow()}\n")
        fh.flush()
        yield
    finally:
        with contextlib.suppress(Exception):
            fcntl.flock(fh, fcntl.LOCK_UN)
        fh.close()
77
+
78
+
79
def count_articles(path: Path, source: str) -> int:
    """Cheaply count the articles/records inside one downloaded file.

    The strategy depends on the prefix of *source*:

    * ``pubmed``: occurrences of ``<PubmedArticle>`` in the ``.xml.gz`` stream.
    * ``pmc``: ``.xml`` / ``.nxml`` members of the ``.tar.gz`` archive.
    * ``fda``: summed length of the ``"results"`` entry of every ``*.json``
      file under the directory the ``.zip`` was extracted to.
    * ``clinicaltrials``: ``*.xml`` files under the extracted directory.

    Any other source, or a file with an unexpected extension, counts as 0.
    """
    name = path.name.lower()
    if source.startswith("pubmed"):
        if not name.endswith(".xml.gz"):
            return 0
        needle = b"<PubmedArticle>"
        overlap = b""
        count = 0
        with gzip.open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(CHUNK), b""):
                buf = overlap + chunk
                count += buf.count(needle)
                # Carry over one byte less than the needle: enough to catch a
                # tag split across two chunks, too short to count one twice.
                overlap = buf[-(len(needle) - 1):]
        return count
    if source.startswith("pmc"):
        if not name.endswith(".tar.gz"):
            return 0
        count = 0
        with tarfile.open(path, mode="r|gz") as tar:
            for member in tar:
                if member.isfile() and member.name.lower().endswith((".xml", ".nxml")):
                    count += 1
        return count
    if source.startswith("fda"):
        if not name.endswith(".zip"):
            return 0
        import json
        if name.endswith(".json.zip"):
            # Build the directory from the original-case file name: the
            # lower-cased `name` is only for extension checks and would miss
            # the extracted directory on a case-sensitive filesystem.
            extract_dir = path.parent / path.name[:-len(".json.zip")]
        else:
            extract_dir = path.with_suffix("")
        count = 0
        for json_file in extract_dir.rglob("*.json"):
            try:
                data = json.loads(json_file.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # Unreadable or malformed file: leave it out of the count,
                # but say so instead of silently under-counting.
                LOG.warning("skipping unreadable %s: %s", json_file, exc)
                continue
            results = data.get("results", []) if isinstance(data, dict) else []
            try:
                count += len(results)
            except TypeError:
                # "results" is present but not a sized collection.
                continue
        return count
    if source.startswith("clinicaltrials"):
        if not name.endswith(".zip"):
            return 0
        extract_dir = path.with_suffix("")
        return sum(1 for p in extract_dir.rglob("*.xml") if p.is_file())
    return 0