synthneura 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synthneura/__init__.py +0 -0
- synthneura/core/config.py +44 -0
- synthneura/core/logger.py +46 -0
- synthneura/core/schemas.py +47 -0
- synthneura/ingestion/clinical_trials.py +73 -0
- synthneura/services/pipeline.py +141 -0
- synthneura/services/summary.py +60 -0
- synthneura/storage/__init__.py +10 -0
- synthneura/storage/base.py +36 -0
- synthneura/storage/sqlite_sink.py +552 -0
- synthneura/ui/cli.py +282 -0
- synthneura-2.0.0.dist-info/METADATA +8 -0
- synthneura-2.0.0.dist-info/RECORD +15 -0
- synthneura-2.0.0.dist-info/WHEEL +5 -0
- synthneura-2.0.0.dist-info/top_level.txt +1 -0
synthneura/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class Settings:
    """Immutable bundle of runtime configuration values.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction.
    """

    # Storage
    db_path: str
    sink: str

    # Logging
    log_level: str

    # Optional output destinations and the format used for the main output
    output_path: Optional[str]
    output_format: str
    summary_path: Optional[str]
    changes_path: Optional[str]

    # Fetch limit
    max_results: int
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the ``Settings`` object from ``SYNTHNEURA_*`` environment variables.

    The result is memoized by ``lru_cache``, so the environment is only read
    on the first call.

    Returns:
        A ``Settings`` instance. ``max_results`` falls back to 10 when
        ``SYNTHNEURA_MAX_RESULTS`` is not a valid integer.
    """
    env = os.environ.get

    # Parse the only numeric setting up front; a malformed value is ignored.
    try:
        limit = int(env("SYNTHNEURA_MAX_RESULTS", "10"))
    except ValueError:
        limit = 10

    return Settings(
        db_path=env("SYNTHNEURA_DB_PATH", "trials.db"),
        sink=env("SYNTHNEURA_SINK", "sqlite"),
        log_level=env("SYNTHNEURA_LOG_LEVEL", "INFO"),
        output_path=env("SYNTHNEURA_OUTPUT_PATH"),
        output_format=env("SYNTHNEURA_OUTPUT_FORMAT", "json"),
        summary_path=env("SYNTHNEURA_SUMMARY_PATH"),
        changes_path=env("SYNTHNEURA_CHANGES_PATH"),
        max_results=limit,
    )
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, attaching a stdout handler on first use.

    A logger that already has handlers is returned untouched. Otherwise it
    gets a single ``StreamHandler`` writing to ``sys.stdout``, and propagation
    to ancestor loggers is switched off.
    """
    log = logging.getLogger(name)

    if not log.handlers:
        stream = logging.StreamHandler(sys.stdout)
        stream.setLevel(logging.NOTSET)
        stream.setFormatter(
            logging.Formatter("%(asctime)s | %(levelname)s | %(name)s | %(message)s")
        )

        # NOTSET makes the effective level come from the ancestors (root),
        # so CLI/user configuration controls verbosity.
        log.setLevel(logging.NOTSET)
        log.addHandler(stream)
        log.propagate = False

    return log
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def set_log_level(level: Any) -> None:
    """Apply *level* to the root logger and every existing logger and handler.

    Args:
        level: Either a level name (case-insensitive, surrounding whitespace
            ignored, e.g. ``"debug"``) or a value convertible with ``int()``.
            A string that does not name a numeric ``logging`` level constant
            resolves to ``logging.INFO``.

    Raises:
        TypeError, ValueError: If a non-string *level* cannot be converted
            by ``int()``.
    """
    if isinstance(level, str):
        candidate = getattr(logging, level.strip().upper(), None)
        # The logging module also exposes non-level upper-case attributes
        # (e.g. BASIC_FORMAT is a str). Passing one to setLevel() raises,
        # so only genuine integer constants are accepted.
        is_level = isinstance(candidate, int) and not isinstance(candidate, bool)
        resolved = candidate if is_level else logging.INFO
    else:
        resolved = int(level)

    logging.getLogger().setLevel(resolved)
    # Snapshot the registry so it cannot change size while being iterated.
    for logger in list(logging.Logger.manager.loggerDict.values()):
        # loggerDict also holds PlaceHolder entries, which have no level.
        if isinstance(logger, logging.Logger):
            logger.setLevel(resolved)
            for handler in logger.handlers:
                handler.setLevel(resolved)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ClinicalTrial(BaseModel):
    """
    Canonical internal representation of a clinical trial
    normalized from ClinicalTrials.gov (API v2 or legacy).
    """

    # Core identifier; the only required field.
    nct_id: str

    # Descriptive fields (often missing in real-world data)
    title: Optional[str] = None
    status: Optional[str] = None
    phase: Optional[str] = None
    sponsor: Optional[str] = None

    # List fields default to a fresh empty list, declared with
    # default_factory like the collection fields on Summary in this module.
    conditions: List[str] = Field(default_factory=list)
    interventions: List[str] = Field(default_factory=list)
    outcomes: List[str] = Field(default_factory=list)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TopCount(BaseModel):
    """A value paired with the number of times it was counted."""

    value: str
    count: int
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Summary(BaseModel):
    """Aggregate statistics describing one set of trials.

    Only ``total_trials`` is required. Every collection field defaults to an
    empty container and every run-metadata field defaults to ``None``.
    """

    total_trials: int

    # Frequency tables
    status_counts: Dict[str, int] = Field(default_factory=dict)
    phase_counts: Dict[str, int] = Field(default_factory=dict)

    # Most frequent values
    top_conditions: List[TopCount] = Field(default_factory=list)
    top_sponsors: List[TopCount] = Field(default_factory=list)
    top_interventions: List[TopCount] = Field(default_factory=list)

    # Counts per change category
    change_counts: Dict[str, int] = Field(default_factory=dict)

    # Run metadata
    run_id: Optional[str] = None
    ingested_at: Optional[str] = None
    source: Optional[str] = None
    query: Optional[str] = None
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
from synthneura.core.logger import get_logger
|
|
6
|
+
|
|
7
|
+
CTG_V2_STUDIES_URL = "https://clinicaltrials.gov/api/v2/studies"
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def fetch_trials(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    """Query the ClinicalTrials.gov v2 studies endpoint and return its studies.

    Args:
        query: Search expression, sent as the ``query.term`` parameter.
        max_results: Sent (stringified) as the ``pageSize`` parameter.

    Returns:
        The non-empty ``studies`` list taken from the JSON response.

    Raises:
        RuntimeError: The HTTP request itself failed; the original
            ``requests.RequestException`` is chained as the cause.
        ValueError: The response was HTML, or it contained no studies.
        requests.HTTPError: The response carried an error status code.
    """
    logger.info("Fetching ClinicalTrials.gov trials")
    logger.debug("Query=%s max_results=%s", query, max_results)

    request_params: dict[str, str] = {
        "query.term": query,
        "pageSize": str(max_results),
        "format": "json",
        "countTotal": "true",
    }

    try:
        response = requests.get(CTG_V2_STUDIES_URL, params=request_params, timeout=30)
    except requests.RequestException as exc:
        logger.exception("HTTP request to ClinicalTrials.gov failed")
        raise RuntimeError("Failed to contact ClinicalTrials.gov") from exc

    status_code = response.status_code
    logger.info(
        "ClinicalTrials.gov response status=%s url=%s", status_code, response.url
    )

    # The HTML check runs before the status check, so an HTML error page is
    # reported as ValueError rather than HTTPError.
    is_html = "text/html" in response.headers.get("Content-Type", "").lower()
    if is_html:
        logger.error(
            "Received HTML instead of JSON (status=%s). Response preview=%s",
            status_code,
            response.text[:200],
        )
        raise ValueError(
            f"ClinicalTrials.gov returned HTML (not JSON). Status={status_code}. Check endpoint/params."
        )

    try:
        response.raise_for_status()
    except requests.HTTPError:
        logger.error(
            "HTTP error from ClinicalTrials.gov status=%s body=%s",
            status_code,
            response.text[:200],
        )
        raise

    studies = response.json().get("studies", [])
    if studies:
        logger.info("Fetched %d trials from ClinicalTrials.gov", len(studies))
        return studies

    logger.warning("No trials found for query=%s", query)
    raise ValueError(f"No trials found for query: {query}")
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
|
+
|
|
3
|
+
from synthneura.core.logger import get_logger
|
|
4
|
+
from synthneura.core.schemas import ClinicalTrial
|
|
5
|
+
|
|
6
|
+
logger = get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _as_list(x: Any) -> List[str]:
|
|
10
|
+
"""Coerce strings / lists / missing values into a list[str]."""
|
|
11
|
+
if x is None:
|
|
12
|
+
return []
|
|
13
|
+
if isinstance(x, str):
|
|
14
|
+
return [x]
|
|
15
|
+
if isinstance(x, list):
|
|
16
|
+
out: List[str] = []
|
|
17
|
+
for item in x:
|
|
18
|
+
if item is None:
|
|
19
|
+
continue
|
|
20
|
+
if isinstance(item, str):
|
|
21
|
+
out.append(item)
|
|
22
|
+
elif isinstance(item, dict):
|
|
23
|
+
# Try common name fields
|
|
24
|
+
name = item.get("name") or item.get("measure") or item.get("title")
|
|
25
|
+
if name:
|
|
26
|
+
out.append(str(name))
|
|
27
|
+
else:
|
|
28
|
+
out.append(str(item))
|
|
29
|
+
return out
|
|
30
|
+
if isinstance(x, dict):
|
|
31
|
+
# legacy containers like {"Condition": [...]}
|
|
32
|
+
for key in ("Condition", "Intervention", "PrimaryOutcome"):
|
|
33
|
+
if key in x:
|
|
34
|
+
return _as_list(x.get(key))
|
|
35
|
+
# fallback: stringify dict
|
|
36
|
+
return [str(x)]
|
|
37
|
+
return [str(x)]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def normalize_trial(raw_trial: Dict[str, Any]) -> ClinicalTrial:
    """
    Normalize raw trial data into the ClinicalTrial schema.

    Supports:
    - ClinicalTrials.gov API v2 (preferred), read from ``protocolSection``
    - Legacy v1 'FullStudies' shape (fallback), detected by a top-level
      ``Study`` key

    Raises:
        KeyError: A legacy record lacks ``ProtocolSection``,
            ``IdentificationModule`` or ``NCTId``.
        ValueError: A v2 record has no ``nctId``.
    """
    # --- Detect v1 wrapper ---
    if "Study" in raw_trial:
        ps = raw_trial["Study"]["ProtocolSection"]
        ident = ps["IdentificationModule"]

        nct_id = ident["NCTId"]
        title = ident.get("BriefTitle") or ident.get("OfficialTitle")
        status = ps.get("StatusModule", {}).get("OverallStatus")

        # Accept a flat "Phase" value as well as the nested
        # PhaseList -> Phase list, and join multiple phases as in the v2 path.
        # NOTE(review): legacy key names below follow the classic API field
        # list; confirm against archived samples.
        design_mod = ps.get("DesignModule") or {}
        raw_phase = design_mod.get("Phase") or (
            design_mod.get("PhaseList") or {}
        ).get("Phase")
        phase = ", ".join(_as_list(raw_phase)) if raw_phase else None

        sponsor_mod = (
            ps.get("SponsorCollaboratorsModule") or ps.get("SponsorModule") or {}
        )
        lead_sponsor = sponsor_mod.get("LeadSponsor") or {}
        sponsor = lead_sponsor.get("LeadSponsorName") or lead_sponsor.get("Name")

        conditions = _as_list(
            ps.get("ConditionsModule", {}).get("ConditionList", {}).get("Condition")
        )
        interventions = _as_list(
            ps.get("ArmsInterventionsModule", {})
            .get("InterventionList", {})
            .get("Intervention")
        )
        outcomes = _as_list(
            ps.get("OutcomesModule", {})
            .get("PrimaryOutcomeList", {})
            .get("PrimaryOutcome")
        )

        logger.debug(
            "Normalized v1 trial nct_id=%s title=%s", nct_id, (title or "")[:60]
        )

        return ClinicalTrial(
            nct_id=nct_id,
            title=title,
            status=status,
            phase=phase,
            sponsor=sponsor,
            conditions=conditions,
            interventions=interventions,
            outcomes=outcomes,
        )

    # --- v2 path ---
    # "or {}" also covers modules that are present but null.
    ps = raw_trial.get("protocolSection", {}) or {}

    ident = ps.get("identificationModule", {}) or {}
    status_mod = ps.get("statusModule", {}) or {}
    design_mod = ps.get("designModule", {}) or {}
    # API v2 publishes the lead sponsor under sponsorCollaboratorsModule;
    # the previously used "sponsorsModule" key is kept as a fallback.
    sponsor_mod = (
        ps.get("sponsorCollaboratorsModule") or ps.get("sponsorsModule") or {}
    )
    cond_mod = ps.get("conditionsModule", {}) or {}
    arms_mod = ps.get("armsInterventionsModule", {}) or {}
    outcomes_mod = ps.get("outcomesModule", {}) or {}

    nct_id = ident.get("nctId")
    title = ident.get("briefTitle") or ident.get("officialTitle")
    status = status_mod.get("overallStatus")

    phases = design_mod.get("phases")
    phase = ", ".join(_as_list(phases)) if phases else None

    sponsor = (sponsor_mod.get("leadSponsor", {}) or {}).get("name")

    conditions = _as_list(cond_mod.get("conditions"))
    interventions = _as_list(arms_mod.get("interventions"))
    outcomes = _as_list(outcomes_mod.get("primaryOutcomes"))

    if not nct_id:
        logger.error(
            "Missing nctId in trial record. Keys=%s",
            list(raw_trial.keys()),
        )
        raise ValueError(
            "Missing nctId in trial record "
            "(unexpected ClinicalTrials.gov response shape)."
        )

    # Helpful warnings (non-fatal)
    if not title:
        logger.warning("Missing title for nct_id=%s", nct_id)
    if not status:
        logger.warning("Missing status for nct_id=%s", nct_id)

    logger.debug("Normalized v2 trial nct_id=%s title=%s", nct_id, (title or "")[:60])

    return ClinicalTrial(
        nct_id=nct_id,
        title=title,
        status=status,
        phase=phase,
        sponsor=sponsor,
        conditions=conditions,
        interventions=interventions,
        outcomes=outcomes,
    )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
3
|
+
|
|
4
|
+
from synthneura.core.schemas import Summary, TopCount
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _as_str_list(values: Iterable[Any]) -> List[str]:
|
|
8
|
+
out: List[str] = []
|
|
9
|
+
for value in values:
|
|
10
|
+
if value is None:
|
|
11
|
+
continue
|
|
12
|
+
if isinstance(value, str):
|
|
13
|
+
if value.strip():
|
|
14
|
+
out.append(value.strip())
|
|
15
|
+
continue
|
|
16
|
+
if isinstance(value, list):
|
|
17
|
+
out.extend(_as_str_list(value))
|
|
18
|
+
continue
|
|
19
|
+
out.append(str(value))
|
|
20
|
+
return out
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _top_counts(values: Iterable[str], limit: int = 5) -> List[TopCount]:
    """Return the *limit* most common entries of *values* as ``TopCount`` items.

    Ordering is the one produced by ``Counter.most_common``.
    """
    ranked = Counter(values).most_common(limit)
    return [TopCount(value=entry, count=occurrences) for entry, occurrences in ranked]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _trial_values(trial: Dict[str, Any], key: str) -> List[str]:
    """Return the cleaned string values stored under *key* of one trial dict."""
    raw = trial.get(key)
    # A missing/None value or a bare string is wrapped so it is treated as a
    # single item instead of failing to iterate or being split into characters.
    if raw is None or isinstance(raw, str):
        return _as_str_list([raw])
    return _as_str_list(raw)


def summarize_trials(
    trials: List[Dict[str, Any]],
    *,
    run_meta: Optional[Dict[str, str]] = None,
    change_counts: Optional[Dict[str, int]] = None,
) -> Summary:
    """Aggregate a list of trial dicts into a ``Summary``.

    Args:
        trials: Trial dicts; the keys ``status``, ``phase``, ``sponsor``,
            ``conditions`` and ``interventions`` are read when present.
        run_meta: Optional run metadata; ``run_id``, ``ingested_at``,
            ``source`` and ``query`` are copied into the summary.
        change_counts: Optional per-category change counts, copied through.

    Returns:
        A ``Summary`` with total, status/phase frequency tables and the most
        common conditions, sponsors and interventions.
    """
    statuses = _as_str_list(trial.get("status") for trial in trials)
    phases = _as_str_list(trial.get("phase") for trial in trials)
    sponsors = _as_str_list(trial.get("sponsor") for trial in trials)

    conditions: List[str] = []
    interventions: List[str] = []
    for trial in trials:
        conditions.extend(_trial_values(trial, "conditions"))
        interventions.extend(_trial_values(trial, "interventions"))

    run_meta = run_meta or {}

    return Summary(
        total_trials=len(trials),
        status_counts=dict(Counter(statuses)),
        phase_counts=dict(Counter(phases)),
        top_conditions=_top_counts(conditions),
        top_sponsors=_top_counts(sponsors),
        top_interventions=_top_counts(interventions),
        change_counts=change_counts or {},
        run_id=run_meta.get("run_id"),
        ingested_at=run_meta.get("ingested_at"),
        source=run_meta.get("source"),
        query=run_meta.get("query"),
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
from synthneura.core.schemas import ClinicalTrial
|
|
6
|
+
|
|
7
|
+
RunMeta = Dict[str, str]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class ChangeRecord:
    """Immutable description of how one trial changed.

    The dataclass is frozen, so attributes cannot be reassigned; the list
    and dict values themselves remain ordinary mutable objects.
    """

    # Trial identifier
    nct_id: str
    # Change category label
    change_status: str
    # Names of the fields whose values differ
    changed_fields: List[str]
    # Field values before and after the change
    before: Dict[str, Any]
    after: Dict[str, Any]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class StoreResult:
    """Immutable outcome of a store operation."""

    # Number of trials per change category
    counts: Dict[str, int]
    # Detailed records for the changes that were detected
    changes: List[ChangeRecord]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TrialSink(ABC):
    """Abstract interface for a destination that persists trials."""

    @abstractmethod
    def store_trials(
        self,
        trials: List[ClinicalTrial],
        raw_trials: List[Dict[str, Any]],
        run_meta: RunMeta,
    ) -> StoreResult:
        """Persist *trials* and report what happened.

        Args:
            trials: Normalized trials to store.
            raw_trials: The raw records the trials were built from.
            run_meta: String metadata describing the current run.

        Returns:
            A ``StoreResult`` with counts for new/updated/unchanged/failed.
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,552 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import sqlite3
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
from synthneura.core.logger import get_logger
|
|
7
|
+
from synthneura.core.schemas import ClinicalTrial
|
|
8
|
+
from synthneura.storage.base import ChangeRecord, RunMeta, StoreResult, TrialSink
|
|
9
|
+
|
|
10
|
+
logger = get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _utc_now() -> str:
|
|
14
|
+
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _ensure_schema(conn: sqlite3.Connection) -> None:
|
|
18
|
+
cursor = conn.cursor()
|
|
19
|
+
cursor.execute(
|
|
20
|
+
"""
|
|
21
|
+
CREATE TABLE IF NOT EXISTS clinical_trials (
|
|
22
|
+
nct_id TEXT PRIMARY KEY,
|
|
23
|
+
title TEXT,
|
|
24
|
+
status TEXT,
|
|
25
|
+
phase TEXT,
|
|
26
|
+
sponsor TEXT,
|
|
27
|
+
conditions TEXT,
|
|
28
|
+
interventions TEXT,
|
|
29
|
+
outcomes TEXT,
|
|
30
|
+
raw_json TEXT,
|
|
31
|
+
source TEXT,
|
|
32
|
+
query TEXT,
|
|
33
|
+
ingested_at TEXT,
|
|
34
|
+
run_id TEXT
|
|
35
|
+
)
|
|
36
|
+
"""
|
|
37
|
+
)
|
|
38
|
+
conn.commit()
|
|
39
|
+
|
|
40
|
+
cursor.execute("PRAGMA table_info(clinical_trials)")
|
|
41
|
+
existing = {row[1] for row in cursor.fetchall()}
|
|
42
|
+
expected = {
|
|
43
|
+
"raw_json",
|
|
44
|
+
"source",
|
|
45
|
+
"query",
|
|
46
|
+
"ingested_at",
|
|
47
|
+
"run_id",
|
|
48
|
+
}
|
|
49
|
+
missing = expected - existing
|
|
50
|
+
for column in sorted(missing):
|
|
51
|
+
cursor.execute(f"ALTER TABLE clinical_trials ADD COLUMN {column} TEXT")
|
|
52
|
+
if missing:
|
|
53
|
+
conn.commit()
|
|
54
|
+
|
|
55
|
+
cursor.execute(
|
|
56
|
+
"""
|
|
57
|
+
CREATE TABLE IF NOT EXISTS clinical_trials_history (
|
|
58
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
59
|
+
nct_id TEXT,
|
|
60
|
+
title TEXT,
|
|
61
|
+
status TEXT,
|
|
62
|
+
phase TEXT,
|
|
63
|
+
sponsor TEXT,
|
|
64
|
+
conditions TEXT,
|
|
65
|
+
interventions TEXT,
|
|
66
|
+
outcomes TEXT,
|
|
67
|
+
raw_json TEXT,
|
|
68
|
+
source TEXT,
|
|
69
|
+
query TEXT,
|
|
70
|
+
ingested_at TEXT,
|
|
71
|
+
run_id TEXT,
|
|
72
|
+
valid_from TEXT,
|
|
73
|
+
valid_to TEXT
|
|
74
|
+
)
|
|
75
|
+
"""
|
|
76
|
+
)
|
|
77
|
+
conn.commit()
|
|
78
|
+
|
|
79
|
+
cursor.execute("PRAGMA table_info(clinical_trials_history)")
|
|
80
|
+
existing_history = {row[1] for row in cursor.fetchall()}
|
|
81
|
+
expected_history = {
|
|
82
|
+
"nct_id",
|
|
83
|
+
"title",
|
|
84
|
+
"status",
|
|
85
|
+
"phase",
|
|
86
|
+
"sponsor",
|
|
87
|
+
"conditions",
|
|
88
|
+
"interventions",
|
|
89
|
+
"outcomes",
|
|
90
|
+
"raw_json",
|
|
91
|
+
"source",
|
|
92
|
+
"query",
|
|
93
|
+
"ingested_at",
|
|
94
|
+
"run_id",
|
|
95
|
+
"valid_from",
|
|
96
|
+
"valid_to",
|
|
97
|
+
}
|
|
98
|
+
missing_history = expected_history - existing_history
|
|
99
|
+
for column in sorted(missing_history):
|
|
100
|
+
cursor.execute(f"ALTER TABLE clinical_trials_history ADD COLUMN {column} TEXT")
|
|
101
|
+
if missing_history:
|
|
102
|
+
conn.commit()
|
|
103
|
+
|
|
104
|
+
cursor.execute(
|
|
105
|
+
"""
|
|
106
|
+
CREATE TABLE IF NOT EXISTS runs (
|
|
107
|
+
run_id TEXT PRIMARY KEY,
|
|
108
|
+
ingested_at TEXT,
|
|
109
|
+
source TEXT,
|
|
110
|
+
query TEXT,
|
|
111
|
+
total INTEGER,
|
|
112
|
+
new_count INTEGER,
|
|
113
|
+
updated_count INTEGER,
|
|
114
|
+
unchanged_count INTEGER,
|
|
115
|
+
failed_count INTEGER
|
|
116
|
+
)
|
|
117
|
+
"""
|
|
118
|
+
)
|
|
119
|
+
conn.commit()
|
|
120
|
+
|
|
121
|
+
cursor.execute("PRAGMA table_info(runs)")
|
|
122
|
+
existing_runs = {row[1] for row in cursor.fetchall()}
|
|
123
|
+
expected_runs = {
|
|
124
|
+
"ingested_at",
|
|
125
|
+
"source",
|
|
126
|
+
"query",
|
|
127
|
+
"total",
|
|
128
|
+
"new_count",
|
|
129
|
+
"updated_count",
|
|
130
|
+
"unchanged_count",
|
|
131
|
+
"failed_count",
|
|
132
|
+
}
|
|
133
|
+
missing_runs = expected_runs - existing_runs
|
|
134
|
+
for column in sorted(missing_runs):
|
|
135
|
+
column_type = "INTEGER"
|
|
136
|
+
if column in {"ingested_at", "source", "query"}:
|
|
137
|
+
column_type = "TEXT"
|
|
138
|
+
cursor.execute(f"ALTER TABLE runs ADD COLUMN {column} {column_type}")
|
|
139
|
+
if missing_runs:
|
|
140
|
+
conn.commit()
|
|
141
|
+
|
|
142
|
+
cursor.execute(
|
|
143
|
+
"""
|
|
144
|
+
CREATE TABLE IF NOT EXISTS trial_diffs (
|
|
145
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
146
|
+
nct_id TEXT,
|
|
147
|
+
run_id TEXT,
|
|
148
|
+
ingested_at TEXT,
|
|
149
|
+
changed_fields TEXT,
|
|
150
|
+
before_json TEXT,
|
|
151
|
+
after_json TEXT
|
|
152
|
+
)
|
|
153
|
+
"""
|
|
154
|
+
)
|
|
155
|
+
conn.commit()
|
|
156
|
+
|
|
157
|
+
cursor.execute("PRAGMA table_info(trial_diffs)")
|
|
158
|
+
existing_diffs = {row[1] for row in cursor.fetchall()}
|
|
159
|
+
expected_diffs = {
|
|
160
|
+
"nct_id",
|
|
161
|
+
"run_id",
|
|
162
|
+
"ingested_at",
|
|
163
|
+
"changed_fields",
|
|
164
|
+
"before_json",
|
|
165
|
+
"after_json",
|
|
166
|
+
}
|
|
167
|
+
missing_diffs = expected_diffs - existing_diffs
|
|
168
|
+
for column in sorted(missing_diffs):
|
|
169
|
+
cursor.execute(f"ALTER TABLE trial_diffs ADD COLUMN {column} TEXT")
|
|
170
|
+
if missing_diffs:
|
|
171
|
+
conn.commit()
|
|
172
|
+
|
|
173
|
+
cursor.execute(
|
|
174
|
+
"""
|
|
175
|
+
CREATE INDEX IF NOT EXISTS idx_clinical_trials_nct_id
|
|
176
|
+
ON clinical_trials(nct_id)
|
|
177
|
+
"""
|
|
178
|
+
)
|
|
179
|
+
cursor.execute(
|
|
180
|
+
"""
|
|
181
|
+
CREATE INDEX IF NOT EXISTS idx_clinical_trials_history_nct_id_valid_from
|
|
182
|
+
ON clinical_trials_history(nct_id, valid_from)
|
|
183
|
+
"""
|
|
184
|
+
)
|
|
185
|
+
cursor.execute(
|
|
186
|
+
"""
|
|
187
|
+
CREATE INDEX IF NOT EXISTS idx_trial_diffs_nct_id_ingested_at
|
|
188
|
+
ON trial_diffs(nct_id, ingested_at)
|
|
189
|
+
"""
|
|
190
|
+
)
|
|
191
|
+
conn.commit()
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _trial_core_values(trial: ClinicalTrial) -> Tuple[Optional[str], ...]:
    """Return the seven comparable values of *trial* as a tuple.

    The order is title, status, phase, sponsor, conditions, interventions,
    outcomes. The three list fields are flattened to comma-joined strings,
    with a missing list giving an empty string.
    """
    joined_lists = tuple(
        ",".join(items or [])
        for items in (trial.conditions, trial.interventions, trial.outcomes)
    )
    return (trial.title, trial.status, trial.phase, trial.sponsor, *joined_lists)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _core_to_dict(core: Tuple[Optional[str], ...]) -> Dict[str, Any]:
|
|
207
|
+
return {
|
|
208
|
+
"title": core[0],
|
|
209
|
+
"status": core[1],
|
|
210
|
+
"phase": core[2],
|
|
211
|
+
"sponsor": core[3],
|
|
212
|
+
"conditions": core[4],
|
|
213
|
+
"interventions": core[5],
|
|
214
|
+
"outcomes": core[6],
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _changed_fields(
|
|
219
|
+
existing: Tuple[Optional[str], ...], updated: Tuple[Optional[str], ...]
|
|
220
|
+
) -> List[str]:
|
|
221
|
+
fields = [
|
|
222
|
+
"title",
|
|
223
|
+
"status",
|
|
224
|
+
"phase",
|
|
225
|
+
"sponsor",
|
|
226
|
+
"conditions",
|
|
227
|
+
"interventions",
|
|
228
|
+
"outcomes",
|
|
229
|
+
]
|
|
230
|
+
changed: List[str] = []
|
|
231
|
+
for idx, field in enumerate(fields):
|
|
232
|
+
existing_value = existing[idx] or ""
|
|
233
|
+
updated_value = updated[idx] or ""
|
|
234
|
+
if existing_value != updated_value:
|
|
235
|
+
changed.append(field)
|
|
236
|
+
return changed
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _raw_nct_id(raw_trial: Dict[str, Any]) -> Optional[str]:
|
|
240
|
+
if "Study" in raw_trial:
|
|
241
|
+
return (
|
|
242
|
+
raw_trial.get("Study", {})
|
|
243
|
+
.get("ProtocolSection", {})
|
|
244
|
+
.get("IdentificationModule", {})
|
|
245
|
+
.get("NCTId")
|
|
246
|
+
)
|
|
247
|
+
return (
|
|
248
|
+
raw_trial.get("protocolSection", {})
|
|
249
|
+
.get("identificationModule", {})
|
|
250
|
+
.get("nctId")
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class SQLiteTrialSink(TrialSink):
    """``TrialSink`` implementation backed by a SQLite database file.

    Every public method opens its own connection, makes sure the schema
    exists via ``_ensure_schema`` and closes the connection before returning.
    """

    def __init__(self, db_path: str = "trials.db") -> None:
        """Remember the database path; no connection is opened here."""
        self.db_path = db_path

    def fetch_previous_run_time(self, current_run_id: str) -> Optional[str]:
        """Return ``ingested_at`` of the latest run other than *current_run_id*.

        Runs are ordered by their ``ingested_at`` text in descending order.
        Returns ``None`` when no other run is recorded.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            _ensure_schema(conn)
            cursor = conn.cursor()
            cursor.execute(
                """
                SELECT ingested_at
                FROM runs
                WHERE run_id != ?
                ORDER BY ingested_at DESC
                LIMIT 1
                """,
                (current_run_id,),
            )
            row = cursor.fetchone()
        finally:
            conn.close()
        return row[0] if row else None

    def fetch_diffs_between(
        self, start_time: Optional[str], end_time: Optional[str]
    ) -> List[ChangeRecord]:
        """Return recorded diffs with ``start_time < ingested_at <= end_time``.

        Args:
            start_time: Exclusive lower bound; when falsy, no lower bound is
                applied.
            end_time: Inclusive upper bound; when falsy, ``[]`` is returned
                without touching the database.

        Returns:
            ``ChangeRecord`` items, newest first, all with
            ``change_status="updated"``.
        """
        if not end_time:
            return []

        conn = sqlite3.connect(self.db_path)
        try:
            _ensure_schema(conn)
            cursor = conn.cursor()
            if start_time:
                cursor.execute(
                    """
                    SELECT nct_id, changed_fields, before_json, after_json
                    FROM trial_diffs
                    WHERE ingested_at > ? AND ingested_at <= ?
                    ORDER BY ingested_at DESC
                    """,
                    (start_time, end_time),
                )
            else:
                cursor.execute(
                    """
                    SELECT nct_id, changed_fields, before_json, after_json
                    FROM trial_diffs
                    WHERE ingested_at <= ?
                    ORDER BY ingested_at DESC
                    """,
                    (end_time,),
                )
            rows = cursor.fetchall()
        finally:
            conn.close()

        # NULL JSON columns decode to empty containers.
        return [
            ChangeRecord(
                nct_id=nct_id,
                change_status="updated",
                changed_fields=json.loads(changed_fields_json or "[]"),
                before=json.loads(before_json or "{}"),
                after=json.loads(after_json or "{}"),
            )
            for nct_id, changed_fields_json, before_json, after_json in rows
        ]

    def _record_update(
        self,
        cursor: sqlite3.Cursor,
        nct_id: str,
        existing: Tuple[Any, ...],
        new_core: Tuple[Optional[str], ...],
        changed_fields: List[str],
        run_id: Optional[str],
        ingested_at: str,
    ) -> ChangeRecord:
        """Write the diff and history rows for one updated trial.

        *existing* is the full 12-column row read from ``clinical_trials``
        (7 core values, then raw_json, source, query, ingested_at, run_id).
        """
        before = _core_to_dict(tuple(existing[:7]))
        after = _core_to_dict(new_core)

        cursor.execute(
            """
            INSERT INTO trial_diffs
            (nct_id, run_id, ingested_at, changed_fields, before_json, after_json)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (
                nct_id,
                run_id,
                ingested_at,
                json.dumps(changed_fields, ensure_ascii=True),
                json.dumps(before, ensure_ascii=True),
                json.dumps(after, ensure_ascii=True),
            ),
        )
        # Archive the superseded version. It was valid from its own
        # ingestion time (existing[10]) until the current one.
        cursor.execute(
            """
            INSERT INTO clinical_trials_history
            (
                nct_id, title, status, phase, sponsor,
                conditions, interventions, outcomes,
                raw_json, source, query, ingested_at, run_id,
                valid_from, valid_to
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (nct_id, *existing, existing[10], ingested_at),
        )
        return ChangeRecord(
            nct_id=nct_id,
            change_status="updated",
            changed_fields=changed_fields,
            before=before,
            after=after,
        )

    def store_trials(
        self,
        trials: List[ClinicalTrial],
        raw_trials: List[Dict[str, Any]],
        run_meta: RunMeta,
    ) -> StoreResult:
        """Upsert *trials*, recording diffs, history and a run summary row.

        Each trial is classified as ``new`` (no stored row), ``updated``
        (``_changed_fields`` reports at least one difference) or
        ``unchanged``. New and updated trials are written to
        ``clinical_trials``; updated trials additionally get a
        ``trial_diffs`` row and a ``clinical_trials_history`` row. When
        ``run_meta`` carries a ``run_id``, a ``runs`` row is written too.
        All writes are committed together at the end.

        Args:
            trials: Normalized trials to persist.
            raw_trials: Raw records, matched to trials by their NCT id.
            run_meta: Run metadata; ``run_id``, ``ingested_at``, ``source``
                and ``query`` are read.

        Returns:
            ``StoreResult`` with per-category counts (``failed`` is never
            incremented here) and one ``ChangeRecord`` per updated trial.
        """
        logger.debug("Storing %d trials in SQLite db=%s", len(trials), self.db_path)

        raw_by_id: Dict[str, Dict[str, Any]] = {}
        for raw_trial in raw_trials:
            raw_id = _raw_nct_id(raw_trial)
            if raw_id:
                raw_by_id[raw_id] = raw_trial

        run_id = run_meta.get("run_id")
        # Resolve the timestamp once so every row written by this call,
        # including the runs row, shares the same non-null value.
        run_ingested_at = run_meta.get("ingested_at") or _utc_now()

        counts = {"new": 0, "updated": 0, "unchanged": 0, "failed": 0}
        changes: List[ChangeRecord] = []
        conn = sqlite3.connect(self.db_path)
        try:
            _ensure_schema(conn)
            cursor = conn.cursor()
            for trial in trials:
                cursor.execute(
                    """
                    SELECT
                        title, status, phase, sponsor,
                        conditions, interventions, outcomes,
                        raw_json, source, query, ingested_at, run_id
                    FROM clinical_trials
                    WHERE nct_id = ?
                    """,
                    (trial.nct_id,),
                )
                existing = cursor.fetchone()
                new_core = _trial_core_values(trial)

                if existing is None:
                    change_status = "new"
                else:
                    # Classify with the same None/"" normalization that
                    # produces the field list, so an "updated" trial always
                    # has at least one changed field.
                    changed_fields = _changed_fields(tuple(existing[:7]), new_core)
                    if changed_fields:
                        change_status = "updated"
                        changes.append(
                            self._record_update(
                                cursor,
                                trial.nct_id,
                                existing,
                                new_core,
                                changed_fields,
                                run_id,
                                run_ingested_at,
                            )
                        )
                    else:
                        change_status = "unchanged"

                if change_status != "unchanged":
                    # Prefer the raw record from this run; otherwise keep
                    # the stored one, and fall back to an empty object.
                    if trial.nct_id in raw_by_id:
                        raw_json = json.dumps(
                            raw_by_id[trial.nct_id], ensure_ascii=True
                        )
                    elif existing is not None and existing[7] is not None:
                        raw_json = existing[7]
                    else:
                        raw_json = json.dumps({}, ensure_ascii=True)

                    cursor.execute(
                        """
                        INSERT OR REPLACE INTO clinical_trials
                        (
                            nct_id, title, status, phase, sponsor,
                            conditions, interventions, outcomes,
                            raw_json, source, query, ingested_at, run_id
                        )
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        """,
                        (
                            trial.nct_id,
                            *new_core,
                            raw_json,
                            run_meta.get("source"),
                            run_meta.get("query"),
                            run_ingested_at,
                            run_id,
                        ),
                    )
                counts[change_status] += 1

            if run_id:
                cursor.execute(
                    """
                    INSERT OR REPLACE INTO runs
                    (
                        run_id, ingested_at, source, query, total,
                        new_count, updated_count, unchanged_count, failed_count
                    )
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        run_id,
                        run_ingested_at,
                        run_meta.get("source"),
                        run_meta.get("query"),
                        len(trials),
                        counts["new"],
                        counts["updated"],
                        counts["unchanged"],
                        counts["failed"],
                    ),
                )
            conn.commit()
        finally:
            conn.close()
        return StoreResult(counts=counts, changes=changes)
|
synthneura/ui/cli.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import json
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from synthneura.core.config import get_settings
|
|
11
|
+
from synthneura.core.logger import get_logger, set_log_level
|
|
12
|
+
from synthneura.ingestion.clinical_trials import fetch_trials
|
|
13
|
+
from synthneura.services.pipeline import normalize_trial
|
|
14
|
+
from synthneura.services.summary import summarize_trials
|
|
15
|
+
from synthneura.storage.sqlite_sink import SQLiteTrialSink
|
|
16
|
+
|
|
17
|
+
# Module-level logger for the CLI, obtained from the project's get_logger
# helper and keyed by this module's import name.
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _trial_to_dict(trial: Any) -> Dict[str, Any]:
|
|
21
|
+
if hasattr(trial, "model_dump"):
|
|
22
|
+
return trial.model_dump()
|
|
23
|
+
if hasattr(trial, "dict"):
|
|
24
|
+
return trial.dict()
|
|
25
|
+
return dict(trial)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _write_json(path: Path, trials: List[Dict[str, Any]]) -> None:
|
|
29
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
30
|
+
with path.open("w", encoding="utf-8") as f:
|
|
31
|
+
json.dump(trials, f, indent=2, ensure_ascii=True)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _write_json_payload(path: Path, payload: Dict[str, Any]) -> None:
|
|
35
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
36
|
+
with path.open("w", encoding="utf-8") as f:
|
|
37
|
+
json.dump(payload, f, indent=2, ensure_ascii=True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _write_csv(path: Path, trials: List[Dict[str, Any]]) -> None:
|
|
41
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
42
|
+
if not trials:
|
|
43
|
+
path.write_text("", encoding="utf-8")
|
|
44
|
+
return
|
|
45
|
+
fieldnames = sorted({key for trial in trials for key in trial.keys()})
|
|
46
|
+
with path.open("w", newline="", encoding="utf-8") as f:
|
|
47
|
+
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
|
48
|
+
writer.writeheader()
|
|
49
|
+
writer.writerows(trials)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _change_to_dict(change: Any) -> Dict[str, Any]:
|
|
53
|
+
return {
|
|
54
|
+
"nct_id": change.nct_id,
|
|
55
|
+
"change_status": change.change_status,
|
|
56
|
+
"changed_fields": change.changed_fields,
|
|
57
|
+
"before": change.before,
|
|
58
|
+
"after": change.after,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@click.command()
@click.option("--query", required=True, help="Search query for clinical trials.")
@click.option(
    "--db-path",
    default=get_settings().db_path,
    show_default=True,
    help="Path to the SQLite database.",
)
@click.option(
    "--sink",
    default=get_settings().sink,
    show_default=True,
    type=click.Choice(["sqlite"], case_sensitive=False),
    help="Storage backend for persisted trials.",
)
@click.option(
    "--log-level",
    default=get_settings().log_level,
    show_default=True,
    type=click.Choice(
        ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False
    ),
    help="Logging verbosity.",
)
@click.option(
    "--max-results",
    default=get_settings().max_results,
    show_default=True,
    type=int,
    help="Maximum number of trials to fetch.",
)
@click.option(
    "--output-path",
    default=get_settings().output_path,
    help="Optional path to write results (JSON or CSV).",
)
@click.option(
    "--output-format",
    default=get_settings().output_format,
    show_default=True,
    type=click.Choice(["json", "csv"], case_sensitive=False),
    help="Output format when --output-path is provided.",
)
@click.option(
    "--summary-path",
    default=get_settings().summary_path,
    help="Optional path to write summary (JSON).",
)
@click.option(
    "--changes-path",
    default=get_settings().changes_path,
    help="Optional path to write change summary (JSON).",
)
@click.option(
    "--since-last-run",
    is_flag=True,
    help="When set, change summary uses diffs since the previous run.",
)
def run(
    query: str,
    db_path: str,
    sink: str,
    log_level: str,
    max_results: int,
    output_path: Optional[str],
    output_format: str,
    summary_path: Optional[str],
    changes_path: Optional[str],
    since_last_run: bool,
) -> None:
    """
    Run the SynthNeura pipeline to fetch, normalize, and store clinical trials.
    """
    # NOTE: the docstring above doubles as the command's --help text, so the
    # detailed description lives in comments instead.
    #
    # Flow: fetch raw trials for `query`, normalize each one, persist them via
    # the selected sink, build a summary, then optionally write the results,
    # the summary and a change report to disk.
    #
    # Option defaults come from get_settings(), evaluated when this module is
    # imported. ValueError and any other Exception raised inside the pipeline
    # are logged and echoed to the user; they are not re-raised.

    # Set runtime log level for this run
    set_log_level(log_level)
    logger.info("Starting SynthNeura CLI run")
    logger.info(
        "query=%s sink=%s log_level=%s",
        query,
        sink.lower(),
        log_level.upper(),
    )

    click.echo(f"Fetching trials for query: {query}")

    try:
        raw_trials = fetch_trials(query, max_results=max_results)

        if not raw_trials:
            logger.warning("No trials returned from fetch_trials for query=%s", query)
            # Single literal so the two sentences are separated by a space
            # (previously two adjacent f-strings were glued together).
            click.echo(
                f"No trials found for the query: {query}. "
                "Try a broader or different query."
            )
            return

        click.echo(f"Fetched {len(raw_trials)} trials.")
        logger.info("Fetched %d raw trials", len(raw_trials))

        # Metadata shared by every record stored during this invocation.
        run_id = uuid4().hex
        ingested_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
        run_meta = {
            "run_id": run_id,
            "ingested_at": ingested_at,
            "source": "clinicaltrials.gov",
            "query": query,
        }
        normalized_trials: List[Dict[str, Any]] = []
        trial_models = []
        for raw_trial in raw_trials:
            trial = normalize_trial(raw_trial)
            trial_models.append(trial)
            normalized_trials.append(_trial_to_dict(trial))

            # User-friendly output
            click.echo(
                f"- {trial.nct_id} | "
                f"{trial.status or 'UNKNOWN'} | "
                f"{trial.phase or 'NA'} | "
                f"{(trial.title or '')[:80]}"
            )

        if sink.lower() == "sqlite":
            sink_impl = SQLiteTrialSink(db_path)
        else:
            # Handled by the ValueError branch below.
            raise ValueError(f"Unsupported sink: {sink}")

        store_result = sink_impl.store_trials(trial_models, raw_trials, run_meta)
        change_counts = store_result.counts
        stored = len(trial_models)

        logger.info(
            "Completed run. stored=%d new=%d updated=%d unchanged=%d",
            stored,
            change_counts["new"],
            change_counts["updated"],
            change_counts["unchanged"],
        )
        click.echo("Pipeline completed successfully.")

        summary = summarize_trials(
            normalized_trials, run_meta=run_meta, change_counts=change_counts
        )
        summary_dict = _trial_to_dict(summary)

        click.echo(
            f"Summary: total={summary.total_trials} "
            f"status_count={len(summary.status_counts)} "
            f"phase_count={len(summary.phase_counts)} "
            f"new={change_counts['new']} "
            f"updated={change_counts['updated']} "
            f"unchanged={change_counts['unchanged']}"
        )

        if output_path:
            path = Path(output_path)
            if output_format.lower() == "json":
                # JSON output embeds the summary next to the trial list.
                _write_json_payload(
                    path, {"summary": summary_dict, "trials": normalized_trials}
                )
            else:
                _write_csv(path, normalized_trials)
            logger.info("Wrote output %s to %s", output_format.lower(), path)
            click.echo(f"Wrote {output_format.lower()} output to {path}")

        # An explicit --summary-path wins. Otherwise CSV output gets a sidecar
        # summary file, since the CSV itself does not carry the summary.
        if summary_path:
            summary_file = Path(summary_path)
        elif output_path and output_format.lower() == "csv":
            summary_file = Path(f"{output_path}.summary.json")
        else:
            summary_file = None

        if summary_file:
            _write_json_payload(summary_file, summary_dict)
            logger.info("Wrote summary to %s", summary_file)
            click.echo(f"Wrote summary to {summary_file}")

        if since_last_run and sink.lower() == "sqlite":
            # Report diffs between the previous run's time and this run.
            previous_time = sink_impl.fetch_previous_run_time(run_id)
            change_records = [
                _change_to_dict(change)
                for change in sink_impl.fetch_diffs_between(
                    previous_time, run_meta["ingested_at"]
                )
            ]
        else:
            change_records = [
                _change_to_dict(change) for change in store_result.changes
            ]
        # Most-changed records first. The key is always present in these
        # dicts, so guard against a None value rather than a missing key.
        change_records.sort(
            key=lambda record: len(record.get("changed_fields") or []),
            reverse=True,
        )
        changes_payload = {
            "run": run_meta,
            "changes": change_records,
            "top_changed": change_records[:5],
        }

        # An explicit --changes-path wins; otherwise write a sidecar next to
        # the main output when one was requested.
        if changes_path:
            changes_file = Path(changes_path)
        elif output_path:
            changes_file = Path(f"{output_path}.changes.json")
        else:
            changes_file = None

        if changes_file:
            _write_json_payload(changes_file, changes_payload)
            logger.info("Wrote change summary to %s", changes_file)
            click.echo(f"Wrote change summary to {changes_file}")

    except ValueError as e:
        logger.warning("ValueError during run: %s", e)
        click.echo(f"Error: {e}")
    except Exception as e:
        logger.exception("Unexpected error during run")
        click.echo(f"An unexpected error occurred: {e}")
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# Invoke the click command when this module is executed directly as a script.
if __name__ == "__main__":
    run()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
synthneura/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
synthneura/core/config.py,sha256=0dlVnYPlnFJEZRmrHaYLV1_zr2571zBeDJgW3Oyer20,1229
|
|
3
|
+
synthneura/core/logger.py,sha256=l5233qwvz8MjV_ezdstUJPjFR2PiuxidilMQ83YaNmk,1223
|
|
4
|
+
synthneura/core/schemas.py,sha256=giYuEq8Zf5zsov0IzQ3ktRTlz9_8cJHjtvVRdPmgpdo,1336
|
|
5
|
+
synthneura/ingestion/clinical_trials.py,sha256=of408fbDDYjV2mdGhm0afFm2R69yZ6m3sHFIKMbv1VY,2146
|
|
6
|
+
synthneura/services/pipeline.py,sha256=Yes3Ld4imzLhGa2MXXhn1jUATlZ3UhxMRsn8cT6Zd58,4551
|
|
7
|
+
synthneura/services/summary.py,sha256=LFyiipU7wa-0EEatjU_ajBZizaP51gfwyguAeqe1Db4,1936
|
|
8
|
+
synthneura/storage/__init__.py,sha256=q_9hlFMVlOjgUcyEKtSiCIFP96Wi1-IlTm_32hl8VIs,250
|
|
9
|
+
synthneura/storage/base.py,sha256=rc7J3VIOFiNhan0fu2gpdpgaXF8OkceVcqxOed4yzYg,795
|
|
10
|
+
synthneura/storage/sqlite_sink.py,sha256=h0D-6mPpVfjCoiMRan1jtURRK6KxxiaX-cC_lUucXP0,17982
|
|
11
|
+
synthneura/ui/cli.py,sha256=6rVv-kteR1cI9CqXinjmIuZ0E6oMCeWmIB_SiZZCjVk,9000
|
|
12
|
+
synthneura-2.0.0.dist-info/METADATA,sha256=Swx9TFRFvc5VeFJS5prqStIG-1rCr_PcRfwZTWP1nQk,204
|
|
13
|
+
synthneura-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
14
|
+
synthneura-2.0.0.dist-info/top_level.txt,sha256=6XwiS6dxlBMStS2v71SEMv8_Kba31uXwIbkIlgr4-Ic,11
|
|
15
|
+
synthneura-2.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
synthneura
|