synthneura-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
synthneura/__init__.py ADDED
(empty file)
synthneura/core/config.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from typing import Optional
+
+
+ @dataclass(frozen=True)
+ class Settings:
+     db_path: str
+     sink: str
+     log_level: str
+     output_path: Optional[str]
+     output_format: str
+     summary_path: Optional[str]
+     changes_path: Optional[str]
+     max_results: int
+
+
+ @lru_cache(maxsize=1)
+ def get_settings() -> Settings:
+     db_path = os.getenv("SYNTHNEURA_DB_PATH", "trials.db")
+     sink = os.getenv("SYNTHNEURA_SINK", "sqlite")
+     log_level = os.getenv("SYNTHNEURA_LOG_LEVEL", "INFO")
+     output_path = os.getenv("SYNTHNEURA_OUTPUT_PATH")
+     output_format = os.getenv("SYNTHNEURA_OUTPUT_FORMAT", "json")
+     summary_path = os.getenv("SYNTHNEURA_SUMMARY_PATH")
+     changes_path = os.getenv("SYNTHNEURA_CHANGES_PATH")
+     max_results_raw = os.getenv("SYNTHNEURA_MAX_RESULTS", "10")
+
+     try:
+         max_results = int(max_results_raw)
+     except ValueError:
+         max_results = 10
+
+     return Settings(
+         db_path=db_path,
+         sink=sink,
+         log_level=log_level,
+         output_path=output_path,
+         output_format=output_format,
+         summary_path=summary_path,
+         changes_path=changes_path,
+         max_results=max_results,
+     )
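Note (not part of the wheel): a minimal usage sketch of the environment-driven configuration above. The variable names follow the os.getenv calls in get_settings; because get_settings is wrapped in lru_cache, later environment changes only take effect after clearing the cache.

import os
from synthneura.core.config import get_settings

# Override a couple of settings before the first get_settings() call.
os.environ["SYNTHNEURA_DB_PATH"] = "/tmp/trials.db"   # hypothetical path
os.environ["SYNTHNEURA_MAX_RESULTS"] = "25"

settings = get_settings()
print(settings.db_path, settings.max_results)

# get_settings() is cached, so clear the cache if env vars change afterwards.
get_settings.cache_clear()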
synthneura/core/logger.py ADDED
@@ -0,0 +1,46 @@
+ import logging
+ import sys
+ from typing import Any
+
+
+ def get_logger(name: str) -> logging.Logger:
+     """
+     Create or retrieve a configured logger.
+     """
+     logger = logging.getLogger(name)
+
+     if logger.handlers:
+         # Logger already configured
+         return logger
+
+     # Inherit root level by default so CLI/user config controls verbosity.
+     logger.setLevel(logging.NOTSET)
+
+     handler = logging.StreamHandler(sys.stdout)
+     handler.setLevel(logging.NOTSET)
+     formatter = logging.Formatter(
+         "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+     )
+     handler.setFormatter(formatter)
+
+     logger.addHandler(handler)
+     logger.propagate = False
+
+     return logger
+
+
+ def set_log_level(level: Any) -> None:
+     """
+     Set the log level for root + existing module loggers.
+     """
+     if isinstance(level, str):
+         resolved = getattr(logging, level.upper(), logging.INFO)
+     else:
+         resolved = int(level)
+
+     logging.getLogger().setLevel(resolved)
+     for logger in logging.Logger.manager.loggerDict.values():
+         if isinstance(logger, logging.Logger):
+             logger.setLevel(resolved)
+             for handler in logger.handlers:
+                 handler.setLevel(resolved)
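Note (not part of the wheel): a short sketch of how the two helpers above are meant to be combined; the logger name is arbitrary, and set_log_level accepts either a level name or a numeric level.

from synthneura.core.logger import get_logger, set_log_level

log = get_logger("synthneura.example")  # hypothetical module name
set_log_level("DEBUG")                  # or set_log_level(10)
log.debug("messages go to stdout as: timestamp | level | name | message")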
synthneura/core/schemas.py ADDED
@@ -0,0 +1,47 @@
+ from typing import Dict, List, Optional
+
+ from pydantic import BaseModel, Field
+
+
+ class ClinicalTrial(BaseModel):
+     """
+     Canonical internal representation of a clinical trial
+     normalized from ClinicalTrials.gov (API v2 or legacy).
+     """
+
+     # Core identifiers
+     nct_id: str
+
+     # Descriptive fields (often missing in real-world data)
+     title: Optional[str] = None
+     status: Optional[str] = None
+     phase: Optional[str] = None
+     sponsor: Optional[str] = None
+
+     # Lists are defaulted to empty to avoid runtime errors
+     conditions: List[str] = []
+     interventions: List[str] = []
+     outcomes: List[str] = []
+
+
+ class TopCount(BaseModel):
+     value: str
+     count: int
+
+
+ class Summary(BaseModel):
+     """
+     Aggregated summary statistics for a set of trials.
+     """
+
+     total_trials: int
+     status_counts: Dict[str, int] = Field(default_factory=dict)
+     phase_counts: Dict[str, int] = Field(default_factory=dict)
+     top_conditions: List[TopCount] = Field(default_factory=list)
+     top_sponsors: List[TopCount] = Field(default_factory=list)
+     top_interventions: List[TopCount] = Field(default_factory=list)
+     change_counts: Dict[str, int] = Field(default_factory=dict)
+     run_id: Optional[str] = None
+     ingested_at: Optional[str] = None
+     source: Optional[str] = None
+     query: Optional[str] = None
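Note (not part of the wheel): a sketch constructing the models above; all field values are invented examples, and the dump call hedges between pydantic v1 and v2 the same way the CLI does.

from synthneura.core.schemas import ClinicalTrial, Summary, TopCount

trial = ClinicalTrial(
    nct_id="NCT00000000",        # hypothetical identifier
    title="Example study",
    status="RECRUITING",
    conditions=["Migraine"],
)
summary = Summary(
    total_trials=1,
    status_counts={"RECRUITING": 1},
    top_conditions=[TopCount(value="Migraine", count=1)],
)
print(trial.model_dump() if hasattr(trial, "model_dump") else trial.dict())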
synthneura/ingestion/clinical_trials.py ADDED
@@ -0,0 +1,73 @@
+ from typing import Any, Dict, List
+
+ import requests
+
+ from synthneura.core.logger import get_logger
+
+ CTG_V2_STUDIES_URL = "https://clinicaltrials.gov/api/v2/studies"
+
+ logger = get_logger(__name__)
+
+
+ def fetch_trials(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
+     """
+     Fetch clinical trials from ClinicalTrials.gov API v2.
+     Returns a list of study objects (dicts).
+     """
+     logger.info("Fetching ClinicalTrials.gov trials")
+     logger.debug("Query=%s max_results=%s", query, max_results)
+
+     params: dict[str, str] = {
+         "query.term": query,
+         "pageSize": str(max_results),
+         "format": "json",
+         "countTotal": "true",
+     }
+
+     try:
+         response = requests.get(
+             CTG_V2_STUDIES_URL,
+             params=params,
+             timeout=30,
+         )
+     except requests.RequestException as exc:
+         logger.exception("HTTP request to ClinicalTrials.gov failed")
+         raise RuntimeError("Failed to contact ClinicalTrials.gov") from exc
+
+     logger.info(
+         "ClinicalTrials.gov response status=%s url=%s",
+         response.status_code,
+         response.url,
+     )
+
+     content_type = response.headers.get("Content-Type", "")
+     if "text/html" in content_type.lower():
+         logger.error(
+             "Received HTML instead of JSON (status=%s). Response preview=%s",
+             response.status_code,
+             response.text[:200],
+         )
+         raise ValueError(
+             f"ClinicalTrials.gov returned HTML (not JSON). "
+             f"Status={response.status_code}. Check endpoint/params."
+         )
+
+     try:
+         response.raise_for_status()
+     except requests.HTTPError:
+         logger.error(
+             "HTTP error from ClinicalTrials.gov status=%s body=%s",
+             response.status_code,
+             response.text[:200],
+         )
+         raise
+
+     data = response.json()
+     studies = data.get("studies", [])
+
+     if not studies:
+         logger.warning("No trials found for query=%s", query)
+         raise ValueError(f"No trials found for query: {query}")
+
+     logger.info("Fetched %d trials from ClinicalTrials.gov", len(studies))
+     return studies
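Note (not part of the wheel): a sketch of calling fetch_trials directly. It performs a live HTTP request to ClinicalTrials.gov, so network access is assumed and the query string is an arbitrary example.

from synthneura.ingestion.clinical_trials import fetch_trials

try:
    studies = fetch_trials("migraine", max_results=3)   # live request
    print(len(studies), "studies fetched")
except (RuntimeError, ValueError) as exc:               # raised on network/format problems
    print("fetch failed:", exc)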
synthneura/services/pipeline.py ADDED
@@ -0,0 +1,141 @@
+ from typing import Any, Dict, List
+
+ from synthneura.core.logger import get_logger
+ from synthneura.core.schemas import ClinicalTrial
+
+ logger = get_logger(__name__)
+
+
+ def _as_list(x: Any) -> List[str]:
+     """Coerce strings / lists / missing values into a list[str]."""
+     if x is None:
+         return []
+     if isinstance(x, str):
+         return [x]
+     if isinstance(x, list):
+         out: List[str] = []
+         for item in x:
+             if item is None:
+                 continue
+             if isinstance(item, str):
+                 out.append(item)
+             elif isinstance(item, dict):
+                 # Try common name fields
+                 name = item.get("name") or item.get("measure") or item.get("title")
+                 if name:
+                     out.append(str(name))
+             else:
+                 out.append(str(item))
+         return out
+     if isinstance(x, dict):
+         # legacy containers like {"Condition": [...]}
+         for key in ("Condition", "Intervention", "PrimaryOutcome"):
+             if key in x:
+                 return _as_list(x.get(key))
+         # fallback: stringify dict
+         return [str(x)]
+     return [str(x)]
+
+
+ def normalize_trial(raw_trial: Dict[str, Any]) -> ClinicalTrial:
+     """
+     Normalize raw trial data into the ClinicalTrial schema.
+     Supports:
+     - ClinicalTrials.gov API v2 (preferred)
+     - Legacy v1 'FullStudies' shape (fallback)
+     """
+     # --- Detect v1 wrapper ---
+     if "Study" in raw_trial:
+         ps = raw_trial["Study"]["ProtocolSection"]
+
+         nct_id = ps["IdentificationModule"]["NCTId"]
+         title = ps["IdentificationModule"].get("BriefTitle") or ps[
+             "IdentificationModule"
+         ].get("OfficialTitle")
+         status = ps.get("StatusModule", {}).get("OverallStatus")
+         phase = ps.get("DesignModule", {}).get("Phase")
+         sponsor = ps.get("SponsorModule", {}).get("LeadSponsor", {}).get("Name")
+
+         conditions = _as_list(
+             ps.get("ConditionsModule", {}).get("ConditionList", {}).get("Condition")
+         )
+         interventions = _as_list(
+             ps.get("ArmsInterventionsModule", {})
+             .get("InterventionList", {})
+             .get("Intervention")
+         )
+         outcomes = _as_list(
+             ps.get("OutcomesModule", {})
+             .get("PrimaryOutcomeList", {})
+             .get("PrimaryOutcome")
+         )
+
+         logger.debug(
+             "Normalized v1 trial nct_id=%s title=%s", nct_id, (title or "")[:60]
+         )
+
+         return ClinicalTrial(
+             nct_id=nct_id,
+             title=title,
+             status=status,
+             phase=phase,
+             sponsor=sponsor,
+             conditions=conditions,
+             interventions=interventions,
+             outcomes=outcomes,
+         )
+
+     # --- v2 path ---
+     ps = raw_trial.get("protocolSection", {}) or {}
+
+     ident = ps.get("identificationModule", {}) or {}
+     status_mod = ps.get("statusModule", {}) or {}
+     design_mod = ps.get("designModule", {}) or {}
+     sponsor_mod = ps.get("sponsorsModule", {}) or {}
+     cond_mod = ps.get("conditionsModule", {}) or {}
+     arms_mod = ps.get("armsInterventionsModule", {}) or {}
+     outcomes_mod = ps.get("outcomesModule", {}) or {}
+
+     nct_id = ident.get("nctId")
+     title = ident.get("briefTitle") or ident.get("officialTitle")
+     status = status_mod.get("overallStatus")
+
+     phases = design_mod.get("phases")
+     phase = ", ".join(_as_list(phases)) if phases else None
+
+     sponsor = (sponsor_mod.get("leadSponsor", {}) or {}).get("name")
+
+     conditions = _as_list(cond_mod.get("conditions"))
+     interventions = _as_list(arms_mod.get("interventions"))
+
+     primary_outcomes = outcomes_mod.get("primaryOutcomes")
+     outcomes = _as_list(primary_outcomes)
+
+     if not nct_id:
+         logger.error(
+             "Missing nctId in trial record. Keys=%s",
+             list(raw_trial.keys()),
+         )
+         raise ValueError(
+             "Missing nctId in trial record "
+             "(unexpected ClinicalTrials.gov response shape)."
+         )
+
+     # Helpful warnings (non-fatal)
+     if not title:
+         logger.warning("Missing title for nct_id=%s", nct_id)
+     if not status:
+         logger.warning("Missing status for nct_id=%s", nct_id)
+
+     logger.debug("Normalized v2 trial nct_id=%s title=%s", nct_id, (title or "")[:60])
+
+     return ClinicalTrial(
+         nct_id=nct_id,
+         title=title,
+         status=status,
+         phase=phase,
+         sponsor=sponsor,
+         conditions=conditions,
+         interventions=interventions,
+         outcomes=outcomes,
+     )
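Note (not part of the wheel): a sketch feeding normalize_trial a minimal API v2-shaped record. The nested keys mirror the protocolSection lookups above; the values are invented.

from synthneura.services.pipeline import normalize_trial

raw = {
    "protocolSection": {
        "identificationModule": {"nctId": "NCT99999999", "briefTitle": "Example"},
        "statusModule": {"overallStatus": "COMPLETED"},
        "designModule": {"phases": ["PHASE2"]},
        "sponsorsModule": {"leadSponsor": {"name": "Example Sponsor"}},
        "conditionsModule": {"conditions": ["Migraine"]},
    }
}
trial = normalize_trial(raw)
print(trial.nct_id, trial.phase, trial.sponsor)  # NCT99999999 PHASE2 Example Sponsor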
synthneura/services/summary.py ADDED
@@ -0,0 +1,60 @@
+ from collections import Counter
+ from typing import Any, Dict, Iterable, List, Optional
+
+ from synthneura.core.schemas import Summary, TopCount
+
+
+ def _as_str_list(values: Iterable[Any]) -> List[str]:
+     out: List[str] = []
+     for value in values:
+         if value is None:
+             continue
+         if isinstance(value, str):
+             if value.strip():
+                 out.append(value.strip())
+             continue
+         if isinstance(value, list):
+             out.extend(_as_str_list(value))
+             continue
+         out.append(str(value))
+     return out
+
+
+ def _top_counts(values: Iterable[str], limit: int = 5) -> List[TopCount]:
+     counts = Counter(values)
+     return [
+         TopCount(value=value, count=count) for value, count in counts.most_common(limit)
+     ]
+
+
+ def summarize_trials(
+     trials: List[Dict[str, Any]],
+     *,
+     run_meta: Optional[Dict[str, str]] = None,
+     change_counts: Optional[Dict[str, int]] = None,
+ ) -> Summary:
+     statuses = _as_str_list(trial.get("status") for trial in trials)
+     phases = _as_str_list(trial.get("phase") for trial in trials)
+     sponsors = _as_str_list(trial.get("sponsor") for trial in trials)
+
+     conditions: List[str] = []
+     interventions: List[str] = []
+     for trial in trials:
+         conditions.extend(_as_str_list(trial.get("conditions", [])))
+         interventions.extend(_as_str_list(trial.get("interventions", [])))
+
+     run_meta = run_meta or {}
+
+     return Summary(
+         total_trials=len(trials),
+         status_counts=dict(Counter(statuses)),
+         phase_counts=dict(Counter(phases)),
+         top_conditions=_top_counts(conditions),
+         top_sponsors=_top_counts(sponsors),
+         top_interventions=_top_counts(interventions),
+         change_counts=change_counts or {},
+         run_id=run_meta.get("run_id"),
+         ingested_at=run_meta.get("ingested_at"),
+         source=run_meta.get("source"),
+         query=run_meta.get("query"),
+     )
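Note (not part of the wheel): a sketch of summarize_trials over plain dicts, the same shape the CLI builds via _trial_to_dict; all values are invented.

from synthneura.services.summary import summarize_trials

trials = [
    {"status": "RECRUITING", "phase": "PHASE2", "sponsor": "A", "conditions": ["Migraine"]},
    {"status": "COMPLETED", "phase": "PHASE2", "sponsor": "B", "conditions": ["Migraine"]},
]
summary = summarize_trials(trials, run_meta={"run_id": "demo"}, change_counts={"new": 2})
print(summary.total_trials, summary.phase_counts, summary.top_conditions[0].count)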
synthneura/storage/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from synthneura.storage.base import ChangeRecord, RunMeta, StoreResult, TrialSink
+ from synthneura.storage.sqlite_sink import SQLiteTrialSink
+
+ __all__ = [
+     "ChangeRecord",
+     "RunMeta",
+     "StoreResult",
+     "TrialSink",
+     "SQLiteTrialSink",
+ ]
synthneura/storage/base.py ADDED
@@ -0,0 +1,36 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any, Dict, List
+
+ from synthneura.core.schemas import ClinicalTrial
+
+ RunMeta = Dict[str, str]
+
+
+ @dataclass(frozen=True)
+ class ChangeRecord:
+     nct_id: str
+     change_status: str
+     changed_fields: List[str]
+     before: Dict[str, Any]
+     after: Dict[str, Any]
+
+
+ @dataclass(frozen=True)
+ class StoreResult:
+     counts: Dict[str, int]
+     changes: List[ChangeRecord]
+
+
+ class TrialSink(ABC):
+     @abstractmethod
+     def store_trials(
+         self,
+         trials: List[ClinicalTrial],
+         raw_trials: List[Dict[str, Any]],
+         run_meta: RunMeta,
+     ) -> StoreResult:
+         """
+         Persist trials and return counts for new/updated/unchanged/failed.
+         """
+         raise NotImplementedError
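Note (not part of the wheel): a hypothetical alternative TrialSink implementation, sketched only to show what the abstract contract above expects. The in-memory sink, its name, and its counting logic are assumptions, not shipped code.

from typing import Any, Dict, List

from synthneura.core.schemas import ClinicalTrial
from synthneura.storage.base import RunMeta, StoreResult, TrialSink


class InMemoryTrialSink(TrialSink):
    """Hypothetical sink that keeps trials in a dict instead of SQLite."""

    def __init__(self) -> None:
        self.rows: Dict[str, ClinicalTrial] = {}

    def store_trials(
        self,
        trials: List[ClinicalTrial],
        raw_trials: List[Dict[str, Any]],
        run_meta: RunMeta,
    ) -> StoreResult:
        counts = {"new": 0, "updated": 0, "unchanged": 0, "failed": 0}
        for trial in trials:
            counts["updated" if trial.nct_id in self.rows else "new"] += 1
            self.rows[trial.nct_id] = trial
        return StoreResult(counts=counts, changes=[])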
synthneura/storage/sqlite_sink.py ADDED
@@ -0,0 +1,552 @@
+ import json
+ import sqlite3
+ from datetime import datetime, timezone
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from synthneura.core.logger import get_logger
+ from synthneura.core.schemas import ClinicalTrial
+ from synthneura.storage.base import ChangeRecord, RunMeta, StoreResult, TrialSink
+
+ logger = get_logger(__name__)
+
+
+ def _utc_now() -> str:
+     return datetime.now(timezone.utc).isoformat(timespec="seconds")
+
+
+ def _ensure_schema(conn: sqlite3.Connection) -> None:
+     cursor = conn.cursor()
+     cursor.execute(
+         """
+         CREATE TABLE IF NOT EXISTS clinical_trials (
+             nct_id TEXT PRIMARY KEY,
+             title TEXT,
+             status TEXT,
+             phase TEXT,
+             sponsor TEXT,
+             conditions TEXT,
+             interventions TEXT,
+             outcomes TEXT,
+             raw_json TEXT,
+             source TEXT,
+             query TEXT,
+             ingested_at TEXT,
+             run_id TEXT
+         )
+         """
+     )
+     conn.commit()
+
+     cursor.execute("PRAGMA table_info(clinical_trials)")
+     existing = {row[1] for row in cursor.fetchall()}
+     expected = {
+         "raw_json",
+         "source",
+         "query",
+         "ingested_at",
+         "run_id",
+     }
+     missing = expected - existing
+     for column in sorted(missing):
+         cursor.execute(f"ALTER TABLE clinical_trials ADD COLUMN {column} TEXT")
+     if missing:
+         conn.commit()
+
+     cursor.execute(
+         """
+         CREATE TABLE IF NOT EXISTS clinical_trials_history (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             nct_id TEXT,
+             title TEXT,
+             status TEXT,
+             phase TEXT,
+             sponsor TEXT,
+             conditions TEXT,
+             interventions TEXT,
+             outcomes TEXT,
+             raw_json TEXT,
+             source TEXT,
+             query TEXT,
+             ingested_at TEXT,
+             run_id TEXT,
+             valid_from TEXT,
+             valid_to TEXT
+         )
+         """
+     )
+     conn.commit()
+
+     cursor.execute("PRAGMA table_info(clinical_trials_history)")
+     existing_history = {row[1] for row in cursor.fetchall()}
+     expected_history = {
+         "nct_id",
+         "title",
+         "status",
+         "phase",
+         "sponsor",
+         "conditions",
+         "interventions",
+         "outcomes",
+         "raw_json",
+         "source",
+         "query",
+         "ingested_at",
+         "run_id",
+         "valid_from",
+         "valid_to",
+     }
+     missing_history = expected_history - existing_history
+     for column in sorted(missing_history):
+         cursor.execute(f"ALTER TABLE clinical_trials_history ADD COLUMN {column} TEXT")
+     if missing_history:
+         conn.commit()
+
+     cursor.execute(
+         """
+         CREATE TABLE IF NOT EXISTS runs (
+             run_id TEXT PRIMARY KEY,
+             ingested_at TEXT,
+             source TEXT,
+             query TEXT,
+             total INTEGER,
+             new_count INTEGER,
+             updated_count INTEGER,
+             unchanged_count INTEGER,
+             failed_count INTEGER
+         )
+         """
+     )
+     conn.commit()
+
+     cursor.execute("PRAGMA table_info(runs)")
+     existing_runs = {row[1] for row in cursor.fetchall()}
+     expected_runs = {
+         "ingested_at",
+         "source",
+         "query",
+         "total",
+         "new_count",
+         "updated_count",
+         "unchanged_count",
+         "failed_count",
+     }
+     missing_runs = expected_runs - existing_runs
+     for column in sorted(missing_runs):
+         column_type = "INTEGER"
+         if column in {"ingested_at", "source", "query"}:
+             column_type = "TEXT"
+         cursor.execute(f"ALTER TABLE runs ADD COLUMN {column} {column_type}")
+     if missing_runs:
+         conn.commit()
+
+     cursor.execute(
+         """
+         CREATE TABLE IF NOT EXISTS trial_diffs (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             nct_id TEXT,
+             run_id TEXT,
+             ingested_at TEXT,
+             changed_fields TEXT,
+             before_json TEXT,
+             after_json TEXT
+         )
+         """
+     )
+     conn.commit()
+
+     cursor.execute("PRAGMA table_info(trial_diffs)")
+     existing_diffs = {row[1] for row in cursor.fetchall()}
+     expected_diffs = {
+         "nct_id",
+         "run_id",
+         "ingested_at",
+         "changed_fields",
+         "before_json",
+         "after_json",
+     }
+     missing_diffs = expected_diffs - existing_diffs
+     for column in sorted(missing_diffs):
+         cursor.execute(f"ALTER TABLE trial_diffs ADD COLUMN {column} TEXT")
+     if missing_diffs:
+         conn.commit()
+
+     cursor.execute(
+         """
+         CREATE INDEX IF NOT EXISTS idx_clinical_trials_nct_id
+         ON clinical_trials(nct_id)
+         """
+     )
+     cursor.execute(
+         """
+         CREATE INDEX IF NOT EXISTS idx_clinical_trials_history_nct_id_valid_from
+         ON clinical_trials_history(nct_id, valid_from)
+         """
+     )
+     cursor.execute(
+         """
+         CREATE INDEX IF NOT EXISTS idx_trial_diffs_nct_id_ingested_at
+         ON trial_diffs(nct_id, ingested_at)
+         """
+     )
+     conn.commit()
+
+
+ def _trial_core_values(trial: ClinicalTrial) -> Tuple[Optional[str], ...]:
+     return (
+         trial.title,
+         trial.status,
+         trial.phase,
+         trial.sponsor,
+         ",".join(trial.conditions or []),
+         ",".join(trial.interventions or []),
+         ",".join(trial.outcomes or []),
+     )
+
+
+ def _core_to_dict(core: Tuple[Optional[str], ...]) -> Dict[str, Any]:
+     return {
+         "title": core[0],
+         "status": core[1],
+         "phase": core[2],
+         "sponsor": core[3],
+         "conditions": core[4],
+         "interventions": core[5],
+         "outcomes": core[6],
+     }
+
+
+ def _changed_fields(
+     existing: Tuple[Optional[str], ...], updated: Tuple[Optional[str], ...]
+ ) -> List[str]:
+     fields = [
+         "title",
+         "status",
+         "phase",
+         "sponsor",
+         "conditions",
+         "interventions",
+         "outcomes",
+     ]
+     changed: List[str] = []
+     for idx, field in enumerate(fields):
+         existing_value = existing[idx] or ""
+         updated_value = updated[idx] or ""
+         if existing_value != updated_value:
+             changed.append(field)
+     return changed
+
+
+ def _raw_nct_id(raw_trial: Dict[str, Any]) -> Optional[str]:
+     if "Study" in raw_trial:
+         return (
+             raw_trial.get("Study", {})
+             .get("ProtocolSection", {})
+             .get("IdentificationModule", {})
+             .get("NCTId")
+         )
+     return (
+         raw_trial.get("protocolSection", {})
+         .get("identificationModule", {})
+         .get("nctId")
+     )
+
+
+ class SQLiteTrialSink(TrialSink):
+     def __init__(self, db_path: str = "trials.db") -> None:
+         self.db_path = db_path
+
+     def fetch_previous_run_time(self, current_run_id: str) -> Optional[str]:
+         conn = sqlite3.connect(self.db_path)
+         try:
+             _ensure_schema(conn)
+             cursor = conn.cursor()
+             cursor.execute(
+                 """
+                 SELECT ingested_at
+                 FROM runs
+                 WHERE run_id != ?
+                 ORDER BY ingested_at DESC
+                 LIMIT 1
+                 """,
+                 (current_run_id,),
+             )
+             row = cursor.fetchone()
+         finally:
+             conn.close()
+         if row:
+             return row[0]
+         return None
+
+     def fetch_diffs_between(
+         self, start_time: Optional[str], end_time: Optional[str]
+     ) -> List[ChangeRecord]:
+         if not end_time:
+             return []
+         conn = sqlite3.connect(self.db_path)
+         try:
+             _ensure_schema(conn)
+             cursor = conn.cursor()
+             if start_time:
+                 cursor.execute(
+                     """
+                     SELECT nct_id, changed_fields, before_json, after_json
+                     FROM trial_diffs
+                     WHERE ingested_at > ? AND ingested_at <= ?
+                     ORDER BY ingested_at DESC
+                     """,
+                     (start_time, end_time),
+                 )
+             else:
+                 cursor.execute(
+                     """
+                     SELECT nct_id, changed_fields, before_json, after_json
+                     FROM trial_diffs
+                     WHERE ingested_at <= ?
+                     ORDER BY ingested_at DESC
+                     """,
+                     (end_time,),
+                 )
+             rows = cursor.fetchall()
+         finally:
+             conn.close()
+
+         changes: List[ChangeRecord] = []
+         for row in rows:
+             nct_id, changed_fields_json, before_json, after_json = row
+             changes.append(
+                 ChangeRecord(
+                     nct_id=nct_id,
+                     change_status="updated",
+                     changed_fields=json.loads(changed_fields_json or "[]"),
+                     before=json.loads(before_json or "{}"),
+                     after=json.loads(after_json or "{}"),
+                 )
+             )
+         return changes
+
+     def store_trials(
+         self,
+         trials: List[ClinicalTrial],
+         raw_trials: List[Dict[str, Any]],
+         run_meta: RunMeta,
+     ) -> StoreResult:
+         logger.debug("Storing %d trials in SQLite db=%s", len(trials), self.db_path)
+
+         raw_by_id = {}
+         for raw_trial in raw_trials:
+             nct_id = _raw_nct_id(raw_trial)
+             if nct_id:
+                 raw_by_id[nct_id] = raw_trial
+
+         counts = {"new": 0, "updated": 0, "unchanged": 0, "failed": 0}
+         changes: List[ChangeRecord] = []
+         conn = sqlite3.connect(self.db_path)
+         try:
+             _ensure_schema(conn)
+             cursor = conn.cursor()
+             for trial in trials:
+                 cursor.execute(
+                     """
+                     SELECT
+                         title,
+                         status,
+                         phase,
+                         sponsor,
+                         conditions,
+                         interventions,
+                         outcomes,
+                         raw_json,
+                         source,
+                         query,
+                         ingested_at,
+                         run_id
+                     FROM clinical_trials
+                     WHERE nct_id = ?
+                     """,
+                     (trial.nct_id,),
+                 )
+                 existing = cursor.fetchone()
+                 existing_core = existing[:7] if existing else None
+                 existing_raw_json = existing[7] if existing else None
+                 existing_source = existing[8] if existing else None
+                 existing_query = existing[9] if existing else None
+                 existing_ingested_at = existing[10] if existing else None
+                 existing_run_id = existing[11] if existing else None
+                 new_core = _trial_core_values(trial)
+
+                 if existing is None:
+                     change_status = "new"
+                 elif existing_core == new_core:
+                     change_status = "unchanged"
+                 else:
+                     change_status = "updated"
+
+                 ingested_at_value = run_meta.get("ingested_at") or _utc_now()
+                 if change_status == "unchanged" and existing_ingested_at:
+                     ingested_at_value = existing_ingested_at
+                 raw_json = json.dumps(
+                     raw_by_id.get(trial.nct_id, {}), ensure_ascii=True
+                 )
+                 if trial.nct_id not in raw_by_id and existing_raw_json is not None:
+                     raw_json = existing_raw_json
+                 if change_status == "updated" and existing is not None:
+                     changed_fields = _changed_fields(
+                         existing_core or new_core, new_core
+                     )
+                     before = _core_to_dict(existing_core or new_core)
+                     after = _core_to_dict(new_core)
+                     changes.append(
+                         ChangeRecord(
+                             nct_id=trial.nct_id,
+                             change_status=change_status,
+                             changed_fields=changed_fields,
+                             before=before,
+                             after=after,
+                         )
+                     )
+                     cursor.execute(
+                         """
+                         INSERT INTO trial_diffs
+                         (
+                             nct_id,
+                             run_id,
+                             ingested_at,
+                             changed_fields,
+                             before_json,
+                             after_json
+                         )
+                         VALUES (?, ?, ?, ?, ?, ?)
+                         """,
+                         (
+                             trial.nct_id,
+                             run_meta.get("run_id"),
+                             ingested_at_value,
+                             json.dumps(changed_fields, ensure_ascii=True),
+                             json.dumps(before, ensure_ascii=True),
+                             json.dumps(after, ensure_ascii=True),
+                         ),
+                     )
+                 (
+                     existing_title,
+                     existing_status,
+                     existing_phase,
+                     existing_sponsor,
+                     existing_conditions,
+                     existing_interventions,
+                     existing_outcomes,
+                 ) = (
+                     existing_core or new_core
+                 )
+                 cursor.execute(
+                     """
+                     INSERT INTO clinical_trials_history
+                     (
+                         nct_id,
+                         title,
+                         status,
+                         phase,
+                         sponsor,
+                         conditions,
+                         interventions,
+                         outcomes,
+                         raw_json,
+                         source,
+                         query,
+                         ingested_at,
+                         run_id,
+                         valid_from,
+                         valid_to
+                     )
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                     """,
+                     (
+                         trial.nct_id,
+                         existing_title,
+                         existing_status,
+                         existing_phase,
+                         existing_sponsor,
+                         existing_conditions,
+                         existing_interventions,
+                         existing_outcomes,
+                         existing_raw_json,
+                         existing_source,
+                         existing_query,
+                         existing_ingested_at,
+                         existing_run_id,
+                         existing_ingested_at,
+                         ingested_at_value,
+                     ),
+                 )
+                 if change_status != "unchanged":
+                     cursor.execute(
+                         """
+                         INSERT OR REPLACE INTO clinical_trials
+                         (
+                             nct_id,
+                             title,
+                             status,
+                             phase,
+                             sponsor,
+                             conditions,
+                             interventions,
+                             outcomes,
+                             raw_json,
+                             source,
+                             query,
+                             ingested_at,
+                             run_id
+                         )
+                         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                         """,
+                         (
+                             trial.nct_id,
+                             trial.title,
+                             trial.status,
+                             trial.phase,
+                             trial.sponsor,
+                             ",".join(trial.conditions or []),
+                             ",".join(trial.interventions or []),
+                             ",".join(trial.outcomes or []),
+                             raw_json,
+                             run_meta.get("source"),
+                             run_meta.get("query"),
+                             ingested_at_value,
+                             run_meta.get("run_id"),
+                         ),
+                     )
+                 counts[change_status] += 1
+             conn.commit()
+             run_id = run_meta.get("run_id")
+             if run_id:
+                 cursor.execute(
+                     """
+                     INSERT OR REPLACE INTO runs
+                     (
+                         run_id,
+                         ingested_at,
+                         source,
+                         query,
+                         total,
+                         new_count,
+                         updated_count,
+                         unchanged_count,
+                         failed_count
+                     )
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                     """,
+                     (
+                         run_id,
+                         run_meta.get("ingested_at"),
+                         run_meta.get("source"),
+                         run_meta.get("query"),
+                         len(trials),
+                         counts["new"],
+                         counts["updated"],
+                         counts["unchanged"],
+                         counts["failed"],
+                     ),
+                 )
+                 conn.commit()
+         finally:
+             conn.close()
+         return StoreResult(counts=counts, changes=changes)
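Note (not part of the wheel): a sketch of driving SQLiteTrialSink directly without the CLI. The database filename and run metadata values are invented; the run_meta keys follow the ones the CLI builds.

from datetime import datetime, timezone
from uuid import uuid4

from synthneura.core.schemas import ClinicalTrial
from synthneura.storage.sqlite_sink import SQLiteTrialSink

sink = SQLiteTrialSink("demo-trials.db")   # hypothetical database file
trial = ClinicalTrial(nct_id="NCT12345678", title="Example", status="RECRUITING")
run_meta = {
    "run_id": uuid4().hex,
    "ingested_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    "source": "clinicaltrials.gov",
    "query": "example",
}
result = sink.store_trials([trial], raw_trials=[], run_meta=run_meta)
print(result.counts)   # e.g. {'new': 1, 'updated': 0, 'unchanged': 0, 'failed': 0}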
synthneura/ui/cli.py ADDED
@@ -0,0 +1,282 @@
+ import csv
+ import json
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+ from uuid import uuid4
+
+ import click
+
+ from synthneura.core.config import get_settings
+ from synthneura.core.logger import get_logger, set_log_level
+ from synthneura.ingestion.clinical_trials import fetch_trials
+ from synthneura.services.pipeline import normalize_trial
+ from synthneura.services.summary import summarize_trials
+ from synthneura.storage.sqlite_sink import SQLiteTrialSink
+
+ logger = get_logger(__name__)
+
+
+ def _trial_to_dict(trial: Any) -> Dict[str, Any]:
+     if hasattr(trial, "model_dump"):
+         return trial.model_dump()
+     if hasattr(trial, "dict"):
+         return trial.dict()
+     return dict(trial)
+
+
+ def _write_json(path: Path, trials: List[Dict[str, Any]]) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open("w", encoding="utf-8") as f:
+         json.dump(trials, f, indent=2, ensure_ascii=True)
+
+
+ def _write_json_payload(path: Path, payload: Dict[str, Any]) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open("w", encoding="utf-8") as f:
+         json.dump(payload, f, indent=2, ensure_ascii=True)
+
+
+ def _write_csv(path: Path, trials: List[Dict[str, Any]]) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     if not trials:
+         path.write_text("", encoding="utf-8")
+         return
+     fieldnames = sorted({key for trial in trials for key in trial.keys()})
+     with path.open("w", newline="", encoding="utf-8") as f:
+         writer = csv.DictWriter(f, fieldnames=fieldnames)
+         writer.writeheader()
+         writer.writerows(trials)
+
+
+ def _change_to_dict(change: Any) -> Dict[str, Any]:
+     return {
+         "nct_id": change.nct_id,
+         "change_status": change.change_status,
+         "changed_fields": change.changed_fields,
+         "before": change.before,
+         "after": change.after,
+     }
+
+
+ @click.command()
+ @click.option("--query", required=True, help="Search query for clinical trials.")
+ @click.option(
+     "--db-path",
+     default=get_settings().db_path,
+     show_default=True,
+     help="Path to the SQLite database.",
+ )
+ @click.option(
+     "--sink",
+     default=get_settings().sink,
+     show_default=True,
+     type=click.Choice(["sqlite"], case_sensitive=False),
+     help="Storage backend for persisted trials.",
+ )
+ @click.option(
+     "--log-level",
+     default=get_settings().log_level,
+     show_default=True,
+     type=click.Choice(
+         ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False
+     ),
+     help="Logging verbosity.",
+ )
+ @click.option(
+     "--max-results",
+     default=get_settings().max_results,
+     show_default=True,
+     type=int,
+     help="Maximum number of trials to fetch.",
+ )
+ @click.option(
+     "--output-path",
+     default=get_settings().output_path,
+     help="Optional path to write results (JSON or CSV).",
+ )
+ @click.option(
+     "--output-format",
+     default=get_settings().output_format,
+     show_default=True,
+     type=click.Choice(["json", "csv"], case_sensitive=False),
+     help="Output format when --output-path is provided.",
+ )
+ @click.option(
+     "--summary-path",
+     default=get_settings().summary_path,
+     help="Optional path to write summary (JSON).",
+ )
+ @click.option(
+     "--changes-path",
+     default=get_settings().changes_path,
+     help="Optional path to write change summary (JSON).",
+ )
+ @click.option(
+     "--since-last-run",
+     is_flag=True,
+     help="When set, change summary uses diffs since the previous run.",
+ )
+ def run(
+     query: str,
+     db_path: str,
+     sink: str,
+     log_level: str,
+     max_results: int,
+     output_path: Optional[str],
+     output_format: str,
+     summary_path: Optional[str],
+     changes_path: Optional[str],
+     since_last_run: bool,
+ ) -> None:
+     """
+     Run the SynthNeura pipeline to fetch, normalize, and store clinical trials.
+     """
+     # Set runtime log level for this run
+     set_log_level(log_level)
+     logger.info("Starting SynthNeura CLI run")
+     logger.info(
+         "query=%s sink=%s log_level=%s",
+         query,
+         sink.lower(),
+         log_level.upper(),
+     )
+
+     click.echo(f"Fetching trials for query: {query}")
+
+     try:
+         raw_trials = fetch_trials(query, max_results=max_results)
+
+         if not raw_trials:
+             logger.warning("No trials returned from fetch_trials for query=%s", query)
+             click.echo(
+                 f"No trials found for the query: {query}. "
+                 f"Try a broader or different query."
+             )
+             return
+
+         click.echo(f"Fetched {len(raw_trials)} trials.")
+         logger.info("Fetched %d raw trials", len(raw_trials))
+
+         stored = 0
+         run_id = uuid4().hex
+         ingested_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
+         run_meta = {
+             "run_id": run_id,
+             "ingested_at": ingested_at,
+             "source": "clinicaltrials.gov",
+             "query": query,
+         }
+         normalized_trials: List[Dict[str, Any]] = []
+         trial_models = []
+         for raw_trial in raw_trials:
+             trial = normalize_trial(raw_trial)
+             trial_models.append(trial)
+             normalized_trials.append(_trial_to_dict(trial))
+
+             # User-friendly output
+             click.echo(
+                 f"- {trial.nct_id} | "
+                 f"{trial.status or 'UNKNOWN'} | "
+                 f"{trial.phase or 'NA'} | "
+                 f"{(trial.title or '')[:80]}"
+             )
+
+         if sink.lower() == "sqlite":
+             sink_impl = SQLiteTrialSink(db_path)
+         else:
+             raise ValueError(f"Unsupported sink: {sink}")
+
+         store_result = sink_impl.store_trials(trial_models, raw_trials, run_meta)
+         change_counts = store_result.counts
+         stored = len(trial_models)
+
+         logger.info(
+             "Completed run. stored=%d new=%d updated=%d unchanged=%d",
+             stored,
+             change_counts["new"],
+             change_counts["updated"],
+             change_counts["unchanged"],
+         )
+         click.echo("Pipeline completed successfully.")
+
+         summary = summarize_trials(
+             normalized_trials, run_meta=run_meta, change_counts=change_counts
+         )
+         summary_dict = _trial_to_dict(summary)
+
+         click.echo(
+             f"Summary: total={summary.total_trials} "
+             f"status_count={len(summary.status_counts)} "
+             f"phase_count={len(summary.phase_counts)} "
+             f"new={change_counts['new']} "
+             f"updated={change_counts['updated']} "
+             f"unchanged={change_counts['unchanged']}"
+         )
+
+         if output_path:
+             path = Path(output_path)
+             if output_format.lower() == "json":
+                 _write_json_payload(
+                     path, {"summary": summary_dict, "trials": normalized_trials}
+                 )
+             else:
+                 _write_csv(path, normalized_trials)
+             logger.info("Wrote output %s to %s", output_format.lower(), path)
+             click.echo(f"Wrote {output_format.lower()} output to {path}")
+
+         if summary_path:
+             summary_file = Path(summary_path)
+         elif output_path and output_format.lower() == "csv":
+             summary_file = Path(f"{output_path}.summary.json")
+         else:
+             summary_file = None
+
+         if summary_file:
+             _write_json_payload(summary_file, summary_dict)
+             logger.info("Wrote summary to %s", summary_file)
+             click.echo(f"Wrote summary to {summary_file}")
+
+         if since_last_run and sink.lower() == "sqlite":
+             previous_time = sink_impl.fetch_previous_run_time(run_id)
+             change_records = [
+                 _change_to_dict(change)
+                 for change in sink_impl.fetch_diffs_between(
+                     previous_time, run_meta["ingested_at"]
+                 )
+             ]
+         else:
+             change_records = [
+                 _change_to_dict(change) for change in store_result.changes
+             ]
+         change_records.sort(
+             key=lambda record: len(record.get("changed_fields", [])), reverse=True
+         )
+         changes_payload = {
+             "run": run_meta,
+             "changes": change_records,
+             "top_changed": change_records[:5],
+         }
+
+         if changes_path:
+             changes_file = Path(changes_path)
+         elif output_path:
+             changes_file = Path(f"{output_path}.changes.json")
+         else:
+             changes_file = None
+
+         if changes_file:
+             _write_json_payload(changes_file, changes_payload)
+             logger.info("Wrote change summary to %s", changes_file)
+             click.echo(f"Wrote change summary to {changes_file}")
+
+     except ValueError as e:
+         logger.warning("ValueError during run: %s", e)
+         click.echo(f"Error: {e}")
+     except Exception as e:
+         logger.exception("Unexpected error during run")
+         click.echo(f"An unexpected error occurred: {e}")
+
+
+ if __name__ == "__main__":
+     run()
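Note (not part of the wheel): a sketch of invoking the command in-process with click's test runner. Network access is assumed, since the command fetches live data; the query and paths are arbitrary examples.

from click.testing import CliRunner

from synthneura.ui.cli import run

runner = CliRunner()
result = runner.invoke(
    run,
    ["--query", "migraine", "--max-results", "3", "--db-path", "demo-trials.db"],
)
print(result.exit_code)
print(result.output)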
synthneura-2.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,8 @@
+ Metadata-Version: 2.4
+ Name: synthneura
+ Version: 2.0.0
+ Requires-Dist: click>=8.1
+ Requires-Dist: pydantic<3,>=1.10
+ Requires-Dist: requests>=2.31
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7; extra == "dev"
synthneura-2.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ synthneura/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ synthneura/core/config.py,sha256=0dlVnYPlnFJEZRmrHaYLV1_zr2571zBeDJgW3Oyer20,1229
+ synthneura/core/logger.py,sha256=l5233qwvz8MjV_ezdstUJPjFR2PiuxidilMQ83YaNmk,1223
+ synthneura/core/schemas.py,sha256=giYuEq8Zf5zsov0IzQ3ktRTlz9_8cJHjtvVRdPmgpdo,1336
+ synthneura/ingestion/clinical_trials.py,sha256=of408fbDDYjV2mdGhm0afFm2R69yZ6m3sHFIKMbv1VY,2146
+ synthneura/services/pipeline.py,sha256=Yes3Ld4imzLhGa2MXXhn1jUATlZ3UhxMRsn8cT6Zd58,4551
+ synthneura/services/summary.py,sha256=LFyiipU7wa-0EEatjU_ajBZizaP51gfwyguAeqe1Db4,1936
+ synthneura/storage/__init__.py,sha256=q_9hlFMVlOjgUcyEKtSiCIFP96Wi1-IlTm_32hl8VIs,250
+ synthneura/storage/base.py,sha256=rc7J3VIOFiNhan0fu2gpdpgaXF8OkceVcqxOed4yzYg,795
+ synthneura/storage/sqlite_sink.py,sha256=h0D-6mPpVfjCoiMRan1jtURRK6KxxiaX-cC_lUucXP0,17982
+ synthneura/ui/cli.py,sha256=6rVv-kteR1cI9CqXinjmIuZ0E6oMCeWmIB_SiZZCjVk,9000
+ synthneura-2.0.0.dist-info/METADATA,sha256=Swx9TFRFvc5VeFJS5prqStIG-1rCr_PcRfwZTWP1nQk,204
+ synthneura-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ synthneura-2.0.0.dist-info/top_level.txt,sha256=6XwiS6dxlBMStS2v71SEMv8_Kba31uXwIbkIlgr4-Ic,11
+ synthneura-2.0.0.dist-info/RECORD,,
synthneura-2.0.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
synthneura-2.0.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ synthneura