celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/kb/governance.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enterprise governance layer: policy enforcement + audit logging.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import time
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _parse_csv(value: str | None) -> set[str]:
|
|
14
|
+
if not value:
|
|
15
|
+
return set()
|
|
16
|
+
return {item.strip() for item in str(value).split(",") if item.strip()}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class GovernanceEngine:
    """Evaluates runtime policy and writes audit events.

    All behavior is driven by ``session.config`` keys under the
    ``enterprise.`` namespace. Audit events are appended as JSON lines to a
    per-session file under the configured audit directory.
    """

    def __init__(self, session, *, session_id: str):
        self.session = session
        self.session_id = session_id
        # Audit logging defaults on; policy enforcement defaults off.
        self.audit_enabled = bool(session.config.get("enterprise.audit_enabled", True))
        self.enforce_policy = bool(session.config.get("enterprise.enforce_policy", False))
        audit_dir = Path(session.config.get("enterprise.audit_dir", str(Path.home() / ".ct" / "audit")))
        self.audit_path = audit_dir / f"{session_id}.audit.jsonl"

    def check_tool(self, tool_name: str) -> tuple[bool, str]:
        """Return whether tool execution is allowed under active policy.

        Returns ``(allowed, reason)``; ``reason`` is empty when allowed.
        Policy lists are re-read from config on every call so runtime
        config changes take effect immediately.
        """
        if not self.enforce_policy:
            return True, ""

        blocked_tools = _parse_csv(self.session.config.get("enterprise.blocked_tools", ""))
        blocked_categories = _parse_csv(self.session.config.get("enterprise.blocked_categories", ""))
        require_allow = bool(self.session.config.get("enterprise.require_tool_allowlist", False))
        allowlist = _parse_csv(self.session.config.get("enterprise.tool_allowlist", ""))

        # Category is the dotted prefix, e.g. "shell" for "shell.run".
        category = tool_name.split(".", 1)[0] if "." in tool_name else tool_name
        if tool_name in blocked_tools:
            return False, f"Tool blocked by policy: {tool_name}"
        if category in blocked_categories:
            return False, f"Tool category blocked by policy: {category}"
        if require_allow and tool_name not in allowlist:
            return False, f"Tool not in enterprise allowlist: {tool_name}"
        return True, ""

    def apply_plan_policy(self, plan) -> dict[str, Any]:
        """Pre-flight policy validation for plan steps.

        Marks disallowed steps as failed in place, returns a summary of the
        blocked steps, and emits one audit event when anything was blocked.
        """
        blocked = []
        for step in getattr(plan, "steps", []):
            allowed, reason = self.check_tool(step.tool)
            if allowed:
                continue
            step.status = "failed"
            step.result = {"error": "blocked_by_policy", "summary": reason}
            blocked.append({"step_id": step.id, "tool": step.tool, "reason": reason})
        if blocked:
            self.audit_event("plan_policy_block", {"blocked_steps": blocked})
        return {"blocked_steps": blocked, "blocked_count": len(blocked)}

    def query_start(self, *, query: str, context: dict[str, Any] | None = None):
        """Record the start of a user query in the audit log."""
        self.audit_event(
            "query_start",
            {
                "query": query,
                # Only key names are logged, never the context values.
                "context_keys": sorted((context or {}).keys()),
                "profile": self.session.config.get("agent.profile", "research"),
            },
        )

    def query_end(self, *, duration_s: float, iterations: int, total_steps: int):
        """Record query completion plus LLM cost versus the configured budget."""
        max_cost = float(self.session.config.get("enterprise.max_cost_usd_per_query", 0.0) or 0.0)
        llm = self.session.get_llm()
        actual_cost = float(getattr(getattr(llm, "usage", None), "total_cost", 0.0) or 0.0)
        # A budget of 0 (or unset) means "no budget" and never flags.
        exceeded_cost_budget = bool(max_cost > 0 and actual_cost > max_cost)
        self.audit_event(
            "query_end",
            {
                "duration_s": round(duration_s, 4),
                "iterations": iterations,
                "total_steps": total_steps,
                "llm_cost_usd": round(actual_cost, 6),
                "cost_budget_usd": max_cost,
                "cost_budget_exceeded": exceeded_cost_budget,
            },
        )

    def audit_event(self, event_type: str, payload: dict[str, Any]):
        """Append an audit event as one JSON line; best-effort, never raises."""
        if not self.audit_enabled:
            return
        try:
            self.audit_path.parent.mkdir(parents=True, exist_ok=True)
            event = {
                "timestamp": time.time(),
                "session_id": self.session_id,
                "event_type": event_type,
                "payload": payload,
            }
            with open(self.audit_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(event) + "\n")
        except (OSError, TypeError, ValueError):
            # Audit logging is best-effort; policy checks still run.
            # TypeError/ValueError cover non-serializable or circular
            # payloads, which previously escaped the OSError-only handler.
            return
|
ct/kb/ingest.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Knowledge ingestion and normalization pipeline.
|
|
3
|
+
|
|
4
|
+
Builds the knowledge substrate from:
|
|
5
|
+
- local evidence logs (always available)
|
|
6
|
+
- optional live APIs (PubMed, OpenAlex, Open Targets)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
import re
|
|
14
|
+
import time
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from ct.kb.substrate import KnowledgeSubstrate
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class KnowledgeIngestionPipeline:
    """Incremental ingestion pipeline into the canonical knowledge substrate.

    Sources: the local session evidence log (always available) and optional
    live APIs (PubMed, OpenAlex, Open Targets). Progress is persisted in a
    small JSON state file so ingestion is resumable and idempotent.
    """

    def __init__(
        self,
        substrate: KnowledgeSubstrate | None = None,
        *,
        state_path: Path | None = None,
        evidence_path: Path | None = None,
    ):
        # Graph store that receives entities, evidence, and relations.
        self.substrate = substrate or KnowledgeSubstrate()
        # JSON file holding the ingestion cursor and per-source run times.
        self.state_path = state_path or (Path.home() / ".ct" / "knowledge" / "ingest_state.json")
        # Local JSONL evidence log written by agent sessions.
        self.evidence_path = evidence_path or (Path.home() / ".ct" / "evidence" / "evidence.jsonl")
        self._state = self._load_state()

    def _default_state(self) -> dict[str, Any]:
        """Fresh state: no evidence lines consumed, no source runs recorded."""
        return {
            "updated_at": time.time(),
            "evidence_line_offset": 0,
            "source_runs": {},
        }

    def _load_state(self) -> dict[str, Any]:
        """Load persisted state, falling back to defaults on any problem."""
        if not self.state_path.exists():
            return self._default_state()
        try:
            data = json.loads(self.state_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            return self._default_state()
        if not isinstance(data, dict):
            return self._default_state()
        # Backfill keys that may be missing from older state files.
        data.setdefault("updated_at", time.time())
        data.setdefault("evidence_line_offset", 0)
        data.setdefault("source_runs", {})
        return data

    def save_state(self):
        """Persist current state (with a refreshed timestamp) to disk."""
        self.state_path.parent.mkdir(parents=True, exist_ok=True)
        self._state["updated_at"] = time.time()
        self.state_path.write_text(
            json.dumps(self._state, ensure_ascii=True, indent=2),
            encoding="utf-8",
        )

    def ingest(
        self,
        *,
        source: str,
        query: str | None = None,
        max_results: int = 10,
        scan_limit: int = 1000,
    ) -> dict[str, Any]:
        """Ingest from one source into substrate.

        Dispatches to a per-source method. All sources except the local
        evidence store require ``query``. Returns a result dict carrying
        either a ``summary`` or an ``error`` key.
        """
        src = (source or "").strip().lower()
        if src == "evidence_store":
            return self.ingest_evidence_store(scan_limit=scan_limit)
        if src == "pubmed":
            if not query:
                return {"error": "query is required for source=pubmed"}
            return self.ingest_pubmed(query=query, max_results=max_results)
        if src == "openalex":
            if not query:
                return {"error": "query is required for source=openalex"}
            return self.ingest_openalex(query=query, max_results=max_results)
        if src == "opentargets":
            if not query:
                return {"error": "query is required for source=opentargets"}
            return self.ingest_opentargets(query=query)
        return {"error": f"Unknown source '{source}'"}

    def ingest_evidence_store(self, *, scan_limit: int = 1000) -> dict[str, Any]:
        """Ingest new rows from local evidence log.

        Resumes from the persisted line offset so each record is ingested at
        most once, processing up to ``scan_limit`` new lines per call.
        """
        if not self.evidence_path.exists():
            return {
                "summary": "No local evidence store found.",
                "source": "evidence_store",
                "ingested_records": 0,
            }

        try:
            lines = self.evidence_path.read_text(encoding="utf-8").splitlines()
        except OSError as exc:
            return {"error": f"Failed reading evidence store: {exc}"}

        start = int(self._state.get("evidence_line_offset", 0))
        if start >= len(lines):
            return {
                "summary": "No new evidence records to ingest.",
                "source": "evidence_store",
                "ingested_records": 0,
            }

        new_lines = lines[start:][: max(scan_limit, 0)]
        ingested = 0
        linked_entities = 0
        for line in new_lines:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Malformed lines are skipped but still advanced past below.
                continue
            entities = self._ingest_evidence_record(record)
            linked_entities += entities
            ingested += 1

        # Advance the cursor past everything scanned, including bad lines.
        self._state["evidence_line_offset"] = start + len(new_lines)
        self._state["source_runs"]["evidence_store"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": (
                f"Ingested {ingested} evidence record(s) from local store; "
                f"linked {linked_entities} entity mention(s)."
            ),
            "source": "evidence_store",
            "ingested_records": ingested,
            "linked_entities": linked_entities,
            "new_offset": self._state["evidence_line_offset"],
        }

    def _ingest_evidence_record(self, record: dict[str, Any]) -> int:
        """Ingest one session evidence record; return the linked-mention count."""
        query = str(record.get("query", "")).strip()
        synthesis = str(record.get("synthesis_preview", "")).strip()
        session_id = str(record.get("session_id", "")).strip()
        steps = record.get("steps", []) or []

        query_entities = self._extract_entity_mentions(query)
        if not query_entities and query:
            # Fall back to treating the whole query as a single mention.
            query_entities = [query]

        # Session-level evidence node anchoring all links from this record.
        ev = self.substrate.add_evidence(
            source_type="session",
            source_ref=session_id or "unknown_session",
            summary=synthesis or query,
            score=0.6,
            tags=["session", "evidence_store"],
            metadata={"n_completed_steps": int(record.get("n_completed_steps", 0) or 0)},
        )

        entity_ids = []
        for mention in query_entities:
            entity_type = KnowledgeSubstrate.infer_entity_type(mention)
            entity = self.substrate.upsert_entity(entity_type=entity_type, name=mention)
            entity_ids.append(entity.id)

        # Link every unordered pair of query entities as co-mentioned.
        for i, left in enumerate(entity_ids):
            for right in entity_ids[i + 1:]:
                self.substrate.link_entities(
                    subject_id=left,
                    predicate="co_mentioned_in_query",
                    object_id=right,
                    evidence_id=ev.id,
                    polarity="support",
                    score=0.55,
                    metadata={"source": "query"},
                )

        linked = len(entity_ids)
        for step in steps:
            tool_name = str(step.get("tool", "")).strip()
            step_desc = str(step.get("description", "")).strip()
            step_summary = str(step.get("result_summary", "")).strip()
            if not tool_name:
                continue

            tool_entity = self.substrate.upsert_entity(
                entity_type="tool",
                name=tool_name,
                identifier=tool_name,
            )
            step_ev = self.substrate.add_evidence(
                source_type="tool",
                source_ref=tool_name,
                # Cap the stored summary at 1200 chars.
                summary=(step_summary or step_desc)[:1200],
                score=0.65,
                tags=["step_result"],
                metadata={"step_id": step.get("id"), "session_id": session_id},
            )

            for eid in entity_ids:
                self.substrate.link_entities(
                    subject_id=eid,
                    predicate="analyzed_with",
                    object_id=tool_entity.id,
                    evidence_id=step_ev.id,
                    polarity="support",
                    score=0.65,
                    metadata={"step_id": step.get("id")},
                )

            # Entities surfaced by the step itself (beyond the query's).
            step_entities = self._extract_entity_mentions(f"{step_desc} {step_summary}")
            for mention in step_entities:
                if mention in query_entities:
                    continue
                se = self.substrate.upsert_entity(
                    entity_type=KnowledgeSubstrate.infer_entity_type(mention),
                    name=mention,
                )
                linked += 1
                for eid in entity_ids:
                    self.substrate.link_entities(
                        subject_id=eid,
                        predicate="associated_with",
                        object_id=se.id,
                        evidence_id=step_ev.id,
                        polarity="support",
                        score=0.6,
                    )
        return linked

    def ingest_pubmed(self, *, query: str, max_results: int = 10) -> dict[str, Any]:
        """Ingest PubMed search hits as publication entities and links."""
        # Lazy import keeps the live-API dependency out of module import.
        from ct.tools.literature import pubmed_search

        result = pubmed_search(query=query, max_results=max_results)
        if result.get("error"):
            return {"error": result["error"], "source": "pubmed"}

        articles = result.get("articles", []) or []
        query_entities = self._get_or_create_query_entities(query)
        n_links = 0
        for art in articles:
            pmid = str(art.get("pmid", "")).strip()
            title = str(art.get("title", "")).strip()
            if not pmid:
                continue
            pub = self.substrate.upsert_entity(
                entity_type="publication",
                name=title or f"PMID {pmid}",
                identifier=f"PMID:{pmid}",
                metadata={"pmid": pmid, "journal": art.get("journal", "")},
            )
            ev = self.substrate.add_evidence(
                source_type="pubmed",
                source_ref=f"PMID:{pmid}",
                summary=title,
                score=0.75,
                tags=["literature"],
                metadata={"year": art.get("publication_year")},
            )
            for eid in query_entities:
                self.substrate.link_entities(
                    subject_id=eid,
                    predicate="supported_by_literature",
                    object_id=pub.id,
                    evidence_id=ev.id,
                    polarity="support",
                    score=0.75,
                )
                n_links += 1
        self._state["source_runs"]["pubmed"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": f"Ingested {len(articles)} PubMed article(s) for '{query}'.",
            "source": "pubmed",
            "ingested_articles": len(articles),
            "links_created": n_links,
        }

    def ingest_openalex(self, *, query: str, max_results: int = 10) -> dict[str, Any]:
        """Ingest OpenAlex works as publication entities and links."""
        from ct.tools.literature import openalex_search

        result = openalex_search(query=query, max_results=max_results)
        if result.get("error"):
            return {"error": result["error"], "source": "openalex"}

        articles = result.get("articles", []) or []
        query_entities = self._get_or_create_query_entities(query)
        n_links = 0
        for art in articles:
            doi = str(art.get("doi", "")).strip()
            title = str(art.get("title", "")).strip()
            if not doi and not title:
                continue
            # Prefer the DOI as the stable identifier; fall back to title.
            pub_id = doi or title
            pub = self.substrate.upsert_entity(
                entity_type="publication",
                name=title or pub_id,
                identifier=pub_id,
                metadata={
                    "doi": doi,
                    "source": art.get("source", ""),
                    "year": art.get("publication_year"),
                    "cited_by_count": art.get("cited_by_count", 0),
                },
            )
            ev = self.substrate.add_evidence(
                source_type="openalex",
                source_ref=pub_id,
                summary=title,
                score=0.72,
                tags=["literature"],
            )
            for eid in query_entities:
                self.substrate.link_entities(
                    subject_id=eid,
                    predicate="supported_by_literature",
                    object_id=pub.id,
                    evidence_id=ev.id,
                    polarity="support",
                    score=0.72,
                )
                n_links += 1
        self._state["source_runs"]["openalex"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": f"Ingested {len(articles)} OpenAlex work(s) for '{query}'.",
            "source": "openalex",
            "ingested_works": len(articles),
            "links_created": n_links,
        }

    def ingest_opentargets(self, *, query: str) -> dict[str, Any]:
        """Ingest Open Targets target-disease associations for one target."""
        from ct.tools.data_api import opentargets_search

        result = opentargets_search(query=query, entity_type="target")
        if result.get("error"):
            return {"error": result["error"], "source": "opentargets"}

        target_name = str(result.get("name", query)).strip() or query
        target_symbol = str(result.get("symbol", "")).strip()
        # Gene symbol is preferred as the stable identifier when present.
        target_key = target_symbol or target_name
        target = self.substrate.upsert_entity(
            entity_type="gene",
            name=target_name,
            identifier=target_key,
            synonyms=[target_symbol] if target_symbol else [],
            metadata={"opentargets_id": result.get("entity_id", "")},
        )

        associations = result.get("top_disease_associations", []) or result.get("associations", []) or []
        created = 0
        # Cap ingestion at the first 20 associations.
        for assoc in associations[:20]:
            disease_name = str(assoc.get("disease_name") or assoc.get("disease", "")).strip()
            if not disease_name:
                continue
            disease = self.substrate.upsert_entity(entity_type="disease", name=disease_name)
            score = float(assoc.get("overall_score", 0.5) or 0.5)
            ev = self.substrate.add_evidence(
                source_type="opentargets",
                source_ref=str(result.get("entity_id", "")),
                summary=f"{target_name} association with {disease_name}",
                # Clamp the stored confidence into [0.4, 1.0].
                score=max(0.4, min(score, 1.0)),
                tags=["genetics", "target_disease"],
                metadata={"association_score": score},
            )
            self.substrate.link_entities(
                subject_id=target.id,
                predicate="associated_with_disease",
                object_id=disease.id,
                evidence_id=ev.id,
                polarity="support",
                score=max(0.4, min(score, 1.0)),
                metadata={"source": "opentargets"},
            )
            created += 1

        self._state["source_runs"]["opentargets"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": f"Ingested Open Targets associations for '{query}' ({created} relation(s)).",
            "source": "opentargets",
            "relations_created": created,
        }

    def _get_or_create_query_entities(self, query: str) -> list[str]:
        """Upsert entities mentioned in the query and return their ids."""
        mentions = self._extract_entity_mentions(query)
        if not mentions and query:
            # No identifier-like mentions: treat the whole query as one entity.
            mentions = [query]
        ids = []
        for mention in mentions:
            entity = self.substrate.upsert_entity(
                entity_type=KnowledgeSubstrate.infer_entity_type(mention),
                name=mention,
            )
            ids.append(entity.id)
        return ids

    def _extract_entity_mentions(self, text: str) -> list[str]:
        """Extract identifier-like mentions from text, case-insensitively deduped.

        Currently only PMID and NCT identifiers are recognized; callers fall
        back to the full text when nothing matches.
        """
        mentions = []
        # Add PMID/NCT mentions if present.
        mentions.extend(re.findall(r"\bPMID[:\s]?\d+\b", text or "", flags=re.IGNORECASE))
        mentions.extend(re.findall(r"\bNCT\d{8}\b", text or "", flags=re.IGNORECASE))
        dedup = []
        seen = set()
        for m in mentions:
            norm = m.strip()
            if not norm:
                continue
            # Dedup on lowercase while preserving the first-seen casing.
            key = norm.lower()
            if key in seen:
                continue
            seen.add(key)
            dedup.append(norm)
        return dedup
|
ct/kb/reasoning.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evidence ranking and contradiction analysis over the knowledge substrate.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from ct.kb.substrate import KBClaim, KBRelation, KnowledgeSubstrate
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Per-source trust weights used when aggregating claim confidence: curated
# databases rank above literature, which ranks above local tool/session
# evidence; "unknown" is the fallback for unrecognized source types.
SOURCE_WEIGHTS = {
    "pubmed": 0.9,
    "openalex": 0.85,
    "opentargets": 0.92,
    "tool": 0.75,
    "session": 0.65,
    "unknown": 0.5,
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _recency_weight(ts: float, now: float) -> float:
|
|
25
|
+
age_days = max((now - ts) / 86400.0, 0.0)
|
|
26
|
+
return math.exp(-age_days / 365.0)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EvidenceReasoner:
    """Ranks relations by weighted evidence and surfaces support/contradiction conflicts."""

    def __init__(self, substrate: KnowledgeSubstrate):
        self.substrate = substrate

    def relation_score(self, relation: KBRelation, *, now: float | None = None) -> float:
        """Aggregate weighted confidence for relation claims.

        Each claim contributes score * source weight * recency decay, signed
        by polarity (support +1, contradict -1, anything else 0.2).
        """
        now = now or time.time()
        claims = relation.claims
        if not claims:
            return 0.0
        total = 0.0
        for claim in claims:
            evidence = self.substrate.get_evidence(claim.evidence_id)
            src = evidence.source_type if evidence else "unknown"
            source_weight = SOURCE_WEIGHTS.get(src, SOURCE_WEIGHTS["unknown"])
            if claim.polarity == "support":
                sign = 1.0
            elif claim.polarity == "contradict":
                sign = -1.0
            else:
                sign = 0.2
            total += claim.score * source_weight * _recency_weight(claim.timestamp, now) * sign
        return total / max(len(claims), 1)

    def rank_relations(
        self,
        *,
        entity_id: str | None = None,
        predicate: str | None = None,
        limit: int = 20,
    ) -> list[dict[str, Any]]:
        """Return highest-confidence relations, optionally filtered by entity/predicate."""
        ranked: list[dict[str, Any]] = []
        for rel in self.substrate.list_relations():
            if entity_id and entity_id not in (rel.subject_id, rel.object_id):
                continue
            if predicate and rel.predicate != predicate:
                continue
            ranked.append(
                {
                    "relation_id": rel.id,
                    "subject_id": rel.subject_id,
                    "predicate": rel.predicate,
                    "object_id": rel.object_id,
                    "score": round(self.relation_score(rel), 4),
                    "n_claims": len(rel.claims),
                    "last_seen": rel.last_seen,
                }
            )
        # Best score first; claim count and recency break ties.
        ranked.sort(key=lambda row: (row["score"], row["n_claims"], row["last_seen"]), reverse=True)
        return ranked[: max(limit, 0)]

    def detect_contradictions(
        self,
        *,
        entity_id: str | None = None,
        predicate: str | None = None,
        min_claims: int = 2,
    ) -> list[dict[str, Any]]:
        """Find relations with mixed support and contradiction evidence."""
        conflicts: list[dict[str, Any]] = []
        for rel in self.substrate.list_relations():
            if entity_id and entity_id not in (rel.subject_id, rel.object_id):
                continue
            if predicate and rel.predicate != predicate:
                continue
            if len(rel.claims) < min_claims:
                continue
            support = [c for c in rel.claims if c.polarity == "support"]
            contradict = [c for c in rel.claims if c.polarity == "contradict"]
            # Only relations with evidence on BOTH sides are conflicts.
            if support and contradict:
                conflicts.append(
                    {
                        "relation_id": rel.id,
                        "subject_id": rel.subject_id,
                        "predicate": rel.predicate,
                        "object_id": rel.object_id,
                        "support_claims": len(support),
                        "contradict_claims": len(contradict),
                        "support_score": round(self._avg_claim_score(support), 4),
                        "contradict_score": round(self._avg_claim_score(contradict), 4),
                        "last_seen": rel.last_seen,
                    }
                )
        # Most balanced (and strongest) conflicts first, then most recent.
        conflicts.sort(
            key=lambda row: (
                min(row["support_claims"], row["contradict_claims"]),
                max(row["support_score"], row["contradict_score"]),
                row["last_seen"],
            ),
            reverse=True,
        )
        return conflicts

    @staticmethod
    def _avg_claim_score(claims: list[KBClaim]) -> float:
        """Mean raw claim score; 0.0 for an empty list."""
        return sum(c.score for c in claims) / len(claims) if claims else 0.0
|