celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/kb/governance.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Enterprise governance layer: policy enforcement + audit logging.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from pathlib import Path
9
+ import time
10
+ from typing import Any
11
+
12
+
13
def _parse_csv(value: str | None) -> set[str]:
    """Split a comma-separated string into a set of trimmed, non-empty tokens.

    ``None`` and empty/blank input yield an empty set; surrounding whitespace
    on each token is stripped and empty tokens are discarded.
    """
    if not value:
        return set()
    tokens = (part.strip() for part in str(value).split(","))
    return {token for token in tokens if token}
17
+
18
+
19
class GovernanceEngine:
    """Evaluates runtime policy and writes audit events.

    Policy knobs are read from ``session.config`` under the ``enterprise.*``
    namespace; audit events are appended as JSON lines to a per-session file
    under the configured audit directory.
    """

    def __init__(self, session, *, session_id: str):
        """Bind the engine to a session and resolve audit/policy settings.

        Args:
            session: Active session object; must expose ``config.get`` (and
                ``get_llm`` for :meth:`query_end` cost accounting).
            session_id: Identifier used to name the per-session audit file.
        """
        self.session = session
        self.session_id = session_id
        # Auditing defaults ON; policy enforcement defaults OFF.
        self.audit_enabled = bool(session.config.get("enterprise.audit_enabled", True))
        self.enforce_policy = bool(session.config.get("enterprise.enforce_policy", False))
        audit_dir = Path(session.config.get("enterprise.audit_dir", str(Path.home() / ".ct" / "audit")))
        self.audit_path = audit_dir / f"{session_id}.audit.jsonl"

    def check_tool(self, tool_name: str) -> tuple[bool, str]:
        """Return whether tool execution is allowed under active policy.

        Returns:
            ``(True, "")`` when allowed, or ``(False, reason)`` when blocked
            by the tool blocklist, category blocklist, or a required
            allowlist. With enforcement disabled, everything is allowed.
        """
        if not self.enforce_policy:
            return True, ""

        blocked_tools = _parse_csv(self.session.config.get("enterprise.blocked_tools", ""))
        blocked_categories = _parse_csv(self.session.config.get("enterprise.blocked_categories", ""))
        require_allow = bool(self.session.config.get("enterprise.require_tool_allowlist", False))
        allowlist = _parse_csv(self.session.config.get("enterprise.tool_allowlist", ""))

        # Tool names are namespaced "<category>.<name>"; a bare name is its own category.
        category = tool_name.split(".", 1)[0] if "." in tool_name else tool_name
        if tool_name in blocked_tools:
            return False, f"Tool blocked by policy: {tool_name}"
        if category in blocked_categories:
            return False, f"Tool category blocked by policy: {category}"
        if require_allow and tool_name not in allowlist:
            return False, f"Tool not in enterprise allowlist: {tool_name}"
        return True, ""

    def apply_plan_policy(self, plan) -> dict[str, Any]:
        """Pre-flight policy validation for plan steps.

        Marks blocked steps as failed (with a ``blocked_by_policy`` result)
        so they are never executed, and audits the block decisions.

        Returns:
            Dict with ``blocked_steps`` (list of step_id/tool/reason) and
            ``blocked_count``.
        """
        blocked = []
        for step in getattr(plan, "steps", []):
            allowed, reason = self.check_tool(step.tool)
            if allowed:
                continue
            step.status = "failed"
            step.result = {"error": "blocked_by_policy", "summary": reason}
            blocked.append({"step_id": step.id, "tool": step.tool, "reason": reason})
        if blocked:
            self.audit_event("plan_policy_block", {"blocked_steps": blocked})
        return {"blocked_steps": blocked, "blocked_count": len(blocked)}

    def query_start(self, *, query: str, context: dict[str, Any] | None = None):
        """Audit the start of a query (query text, context keys, profile)."""
        self.audit_event(
            "query_start",
            {
                "query": query,
                # Only key names are audited, not context values.
                "context_keys": sorted((context or {}).keys()),
                "profile": self.session.config.get("agent.profile", "research"),
            },
        )

    def query_end(self, *, duration_s: float, iterations: int, total_steps: int):
        """Audit query completion, including LLM spend vs. the cost budget.

        A budget of 0 (the default) means "no budget"; the exceeded flag is
        only raised for a positive budget that was actually overrun.
        """
        max_cost = float(self.session.config.get("enterprise.max_cost_usd_per_query", 0.0) or 0.0)
        llm = self.session.get_llm()
        # Usage tracking may be absent on the LLM object; treat that as $0.
        actual_cost = float(getattr(getattr(llm, "usage", None), "total_cost", 0.0) or 0.0)
        exceeded_cost_budget = bool(max_cost > 0 and actual_cost > max_cost)
        self.audit_event(
            "query_end",
            {
                "duration_s": round(duration_s, 4),
                "iterations": iterations,
                "total_steps": total_steps,
                "llm_cost_usd": round(actual_cost, 6),
                "cost_budget_usd": max_cost,
                "cost_budget_exceeded": exceeded_cost_budget,
            },
        )

    def audit_event(self, event_type: str, payload: dict[str, Any]):
        """Append an audit event as one JSON line (best-effort, never raises)."""
        if not self.audit_enabled:
            return
        try:
            self.audit_path.parent.mkdir(parents=True, exist_ok=True)
            event = {
                "timestamp": time.time(),
                "session_id": self.session_id,
                "event_type": event_type,
                "payload": payload,
            }
            # default=str keeps the best-effort contract when a payload holds
            # non-JSON-serializable values (Path, set, datetime, ...); the
            # broadened except also swallows serialization errors (e.g.
            # circular references raise ValueError) instead of crashing the
            # caller mid-query.
            with open(self.audit_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(event, default=str) + "\n")
        except (OSError, TypeError, ValueError):
            # Audit logging is best-effort; policy checks still run.
            return
ct/kb/ingest.py ADDED
@@ -0,0 +1,415 @@
1
+ """
2
+ Knowledge ingestion and normalization pipeline.
3
+
4
+ Builds the knowledge substrate from:
5
+ - local evidence logs (always available)
6
+ - optional live APIs (PubMed, OpenAlex, Open Targets)
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from pathlib import Path
13
+ import re
14
+ import time
15
+ from typing import Any
16
+
17
+ from ct.kb.substrate import KnowledgeSubstrate
18
+
19
+
20
class KnowledgeIngestionPipeline:
    """Incremental ingestion pipeline into the canonical knowledge substrate.

    Normalizes records from the local evidence log and optional live APIs
    (PubMed, OpenAlex, Open Targets) into substrate entities, evidence rows,
    and scored relations. A JSON cursor file persists between runs so only
    new evidence-log lines are re-ingested.
    """

    def __init__(
        self,
        substrate: KnowledgeSubstrate | None = None,
        *,
        state_path: Path | None = None,
        evidence_path: Path | None = None,
    ):
        # Defaults live under ~/.ct so cursor/evidence state survives sessions.
        self.substrate = substrate or KnowledgeSubstrate()
        self.state_path = state_path or (Path.home() / ".ct" / "knowledge" / "ingest_state.json")
        self.evidence_path = evidence_path or (Path.home() / ".ct" / "evidence" / "evidence.jsonl")
        self._state = self._load_state()

    def _default_state(self) -> dict[str, Any]:
        """Fresh cursor state: nothing consumed, no source runs recorded."""
        return {
            "updated_at": time.time(),
            "evidence_line_offset": 0,
            "source_runs": {},
        }

    def _load_state(self) -> dict[str, Any]:
        """Load persisted cursor state, falling back to defaults on any problem.

        Missing, unreadable, corrupt, or non-dict state files are treated as
        absent (not raised) so ingestion always starts from a valid cursor.
        """
        if not self.state_path.exists():
            return self._default_state()
        try:
            data = json.loads(self.state_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            return self._default_state()
        if not isinstance(data, dict):
            return self._default_state()
        # Backfill keys that may be missing from older state files.
        data.setdefault("updated_at", time.time())
        data.setdefault("evidence_line_offset", 0)
        data.setdefault("source_runs", {})
        return data

    def save_state(self):
        """Persist cursor state as indented JSON, stamping the update time."""
        self.state_path.parent.mkdir(parents=True, exist_ok=True)
        self._state["updated_at"] = time.time()
        self.state_path.write_text(
            json.dumps(self._state, ensure_ascii=True, indent=2),
            encoding="utf-8",
        )

    def ingest(
        self,
        *,
        source: str,
        query: str | None = None,
        max_results: int = 10,
        scan_limit: int = 1000,
    ) -> dict[str, Any]:
        """Ingest from one source into substrate.

        Dispatches on *source* (case-insensitive): ``evidence_store`` scans
        the local log (``scan_limit`` caps lines per run); the API sources
        (``pubmed``, ``openalex``, ``opentargets``) all require *query*.
        Returns the per-source summary dict, or a dict with an ``error`` key.
        """
        src = (source or "").strip().lower()
        if src == "evidence_store":
            return self.ingest_evidence_store(scan_limit=scan_limit)
        if src == "pubmed":
            if not query:
                return {"error": "query is required for source=pubmed"}
            return self.ingest_pubmed(query=query, max_results=max_results)
        if src == "openalex":
            if not query:
                return {"error": "query is required for source=openalex"}
            return self.ingest_openalex(query=query, max_results=max_results)
        if src == "opentargets":
            if not query:
                return {"error": "query is required for source=opentargets"}
            return self.ingest_opentargets(query=query)
        return {"error": f"Unknown source '{source}'"}

    def ingest_evidence_store(self, *, scan_limit: int = 1000) -> dict[str, Any]:
        """Ingest new rows from local evidence log.

        Resumes from the persisted line offset, processes at most
        ``scan_limit`` new JSONL rows (malformed lines are skipped), then
        advances the cursor and saves both state and substrate.
        """
        if not self.evidence_path.exists():
            return {
                "summary": "No local evidence store found.",
                "source": "evidence_store",
                "ingested_records": 0,
            }

        try:
            lines = self.evidence_path.read_text(encoding="utf-8").splitlines()
        except OSError as exc:
            return {"error": f"Failed reading evidence store: {exc}"}

        start = int(self._state.get("evidence_line_offset", 0))
        if start >= len(lines):
            return {
                "summary": "No new evidence records to ingest.",
                "source": "evidence_store",
                "ingested_records": 0,
            }

        # Clamp scan_limit at 0 so a negative value cannot slice from the end.
        new_lines = lines[start:][: max(scan_limit, 0)]
        ingested = 0
        linked_entities = 0
        for line in new_lines:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Skip corrupt rows; the offset still advances past them.
                continue
            entities = self._ingest_evidence_record(record)
            linked_entities += entities
            ingested += 1

        self._state["evidence_line_offset"] = start + len(new_lines)
        self._state["source_runs"]["evidence_store"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": (
                f"Ingested {ingested} evidence record(s) from local store; "
                f"linked {linked_entities} entity mention(s)."
            ),
            "source": "evidence_store",
            "ingested_records": ingested,
            "linked_entities": linked_entities,
            "new_offset": self._state["evidence_line_offset"],
        }

    def _ingest_evidence_record(self, record: dict[str, Any]) -> int:
        """Normalize one evidence-log record into entities/evidence/relations.

        Creates a session-level evidence row, links every pair of query
        entities as co-mentioned, then links each executed step's tool and
        any newly-mentioned entities back to the query entities. Returns the
        number of entity mentions linked.
        """
        query = str(record.get("query", "")).strip()
        synthesis = str(record.get("synthesis_preview", "")).strip()
        session_id = str(record.get("session_id", "")).strip()
        steps = record.get("steps", []) or []

        query_entities = self._extract_entity_mentions(query)
        # No structured mentions found: fall back to the raw query string
        # so the record is still anchored to at least one entity.
        if not query_entities and query:
            query_entities = [query]

        ev = self.substrate.add_evidence(
            source_type="session",
            source_ref=session_id or "unknown_session",
            summary=synthesis or query,
            score=0.6,
            tags=["session", "evidence_store"],
            metadata={"n_completed_steps": int(record.get("n_completed_steps", 0) or 0)},
        )

        entity_ids = []
        for mention in query_entities:
            entity_type = KnowledgeSubstrate.infer_entity_type(mention)
            entity = self.substrate.upsert_entity(entity_type=entity_type, name=mention)
            entity_ids.append(entity.id)

        # Pairwise co-mention links between all query entities (each unordered
        # pair once).
        for i, left in enumerate(entity_ids):
            for right in entity_ids[i + 1:]:
                self.substrate.link_entities(
                    subject_id=left,
                    predicate="co_mentioned_in_query",
                    object_id=right,
                    evidence_id=ev.id,
                    polarity="support",
                    score=0.55,
                    metadata={"source": "query"},
                )

        linked = len(entity_ids)
        for step in steps:
            tool_name = str(step.get("tool", "")).strip()
            step_desc = str(step.get("description", "")).strip()
            step_summary = str(step.get("result_summary", "")).strip()
            if not tool_name:
                continue

            tool_entity = self.substrate.upsert_entity(
                entity_type="tool",
                name=tool_name,
                identifier=tool_name,
            )
            step_ev = self.substrate.add_evidence(
                source_type="tool",
                source_ref=tool_name,
                # Cap the summary to keep evidence rows bounded.
                summary=(step_summary or step_desc)[:1200],
                score=0.65,
                tags=["step_result"],
                metadata={"step_id": step.get("id"), "session_id": session_id},
            )

            # Every query entity was "analyzed_with" this step's tool.
            for eid in entity_ids:
                self.substrate.link_entities(
                    subject_id=eid,
                    predicate="analyzed_with",
                    object_id=tool_entity.id,
                    evidence_id=step_ev.id,
                    polarity="support",
                    score=0.65,
                    metadata={"step_id": step.get("id")},
                )

            # Entities surfaced by the step itself (beyond the query's own)
            # get weaker "associated_with" links to the query entities.
            step_entities = self._extract_entity_mentions(f"{step_desc} {step_summary}")
            for mention in step_entities:
                if mention in query_entities:
                    continue
                se = self.substrate.upsert_entity(
                    entity_type=KnowledgeSubstrate.infer_entity_type(mention),
                    name=mention,
                )
                linked += 1
                for eid in entity_ids:
                    self.substrate.link_entities(
                        subject_id=eid,
                        predicate="associated_with",
                        object_id=se.id,
                        evidence_id=step_ev.id,
                        polarity="support",
                        score=0.6,
                    )
        return linked

    def ingest_pubmed(self, *, query: str, max_results: int = 10) -> dict[str, Any]:
        """Search PubMed for *query* and link results as literature support.

        Each article with a PMID becomes a publication entity plus an
        evidence row, linked to every query entity via
        ``supported_by_literature``.
        """
        # Imported lazily: the tools package pulls in heavy dependencies.
        from ct.tools.literature import pubmed_search

        result = pubmed_search(query=query, max_results=max_results)
        if result.get("error"):
            return {"error": result["error"], "source": "pubmed"}

        articles = result.get("articles", []) or []
        query_entities = self._get_or_create_query_entities(query)
        n_links = 0
        for art in articles:
            pmid = str(art.get("pmid", "")).strip()
            title = str(art.get("title", "")).strip()
            if not pmid:
                continue
            pub = self.substrate.upsert_entity(
                entity_type="publication",
                name=title or f"PMID {pmid}",
                identifier=f"PMID:{pmid}",
                metadata={"pmid": pmid, "journal": art.get("journal", "")},
            )
            ev = self.substrate.add_evidence(
                source_type="pubmed",
                source_ref=f"PMID:{pmid}",
                summary=title,
                score=0.75,
                tags=["literature"],
                metadata={"year": art.get("publication_year")},
            )
            for eid in query_entities:
                self.substrate.link_entities(
                    subject_id=eid,
                    predicate="supported_by_literature",
                    object_id=pub.id,
                    evidence_id=ev.id,
                    polarity="support",
                    score=0.75,
                )
                n_links += 1
        self._state["source_runs"]["pubmed"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": f"Ingested {len(articles)} PubMed article(s) for '{query}'.",
            "source": "pubmed",
            "ingested_articles": len(articles),
            "links_created": n_links,
        }

    def ingest_openalex(self, *, query: str, max_results: int = 10) -> dict[str, Any]:
        """Search OpenAlex for *query* and link results as literature support.

        Works are keyed by DOI when available, falling back to title;
        entries with neither are skipped.
        """
        # Imported lazily: the tools package pulls in heavy dependencies.
        from ct.tools.literature import openalex_search

        result = openalex_search(query=query, max_results=max_results)
        if result.get("error"):
            return {"error": result["error"], "source": "openalex"}

        articles = result.get("articles", []) or []
        query_entities = self._get_or_create_query_entities(query)
        n_links = 0
        for art in articles:
            doi = str(art.get("doi", "")).strip()
            title = str(art.get("title", "")).strip()
            if not doi and not title:
                continue
            pub_id = doi or title
            pub = self.substrate.upsert_entity(
                entity_type="publication",
                name=title or pub_id,
                identifier=pub_id,
                metadata={
                    "doi": doi,
                    "source": art.get("source", ""),
                    "year": art.get("publication_year"),
                    "cited_by_count": art.get("cited_by_count", 0),
                },
            )
            ev = self.substrate.add_evidence(
                source_type="openalex",
                source_ref=pub_id,
                summary=title,
                score=0.72,
                tags=["literature"],
            )
            for eid in query_entities:
                self.substrate.link_entities(
                    subject_id=eid,
                    predicate="supported_by_literature",
                    object_id=pub.id,
                    evidence_id=ev.id,
                    polarity="support",
                    score=0.72,
                )
                n_links += 1
        self._state["source_runs"]["openalex"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": f"Ingested {len(articles)} OpenAlex work(s) for '{query}'.",
            "source": "openalex",
            "ingested_works": len(articles),
            "links_created": n_links,
        }

    def ingest_opentargets(self, *, query: str) -> dict[str, Any]:
        """Ingest Open Targets target-disease associations for *query*.

        Upserts the target as a gene entity and links up to 20 of its
        disease associations, carrying the association score (clamped to
        [0.4, 1.0]) onto both the evidence row and the relation.
        """
        # Imported lazily: the tools package pulls in heavy dependencies.
        from ct.tools.data_api import opentargets_search

        result = opentargets_search(query=query, entity_type="target")
        if result.get("error"):
            return {"error": result["error"], "source": "opentargets"}

        target_name = str(result.get("name", query)).strip() or query
        target_symbol = str(result.get("symbol", "")).strip()
        # Prefer the gene symbol as the canonical identifier when present.
        target_key = target_symbol or target_name
        target = self.substrate.upsert_entity(
            entity_type="gene",
            name=target_name,
            identifier=target_key,
            synonyms=[target_symbol] if target_symbol else [],
            metadata={"opentargets_id": result.get("entity_id", "")},
        )

        associations = result.get("top_disease_associations", []) or result.get("associations", []) or []
        created = 0
        for assoc in associations[:20]:
            disease_name = str(assoc.get("disease_name") or assoc.get("disease", "")).strip()
            if not disease_name:
                continue
            disease = self.substrate.upsert_entity(entity_type="disease", name=disease_name)
            score = float(assoc.get("overall_score", 0.5) or 0.5)
            ev = self.substrate.add_evidence(
                source_type="opentargets",
                source_ref=str(result.get("entity_id", "")),
                summary=f"{target_name} association with {disease_name}",
                score=max(0.4, min(score, 1.0)),
                tags=["genetics", "target_disease"],
                metadata={"association_score": score},
            )
            self.substrate.link_entities(
                subject_id=target.id,
                predicate="associated_with_disease",
                object_id=disease.id,
                evidence_id=ev.id,
                polarity="support",
                score=max(0.4, min(score, 1.0)),
                metadata={"source": "opentargets"},
            )
            created += 1

        self._state["source_runs"]["opentargets"] = time.time()
        self.save_state()
        self.substrate.save()
        return {
            "summary": f"Ingested Open Targets associations for '{query}' ({created} relation(s)).",
            "source": "opentargets",
            "relations_created": created,
        }

    def _get_or_create_query_entities(self, query: str) -> list[str]:
        """Upsert entities for the query's mentions and return their ids.

        Falls back to treating the whole query as a single entity when no
        structured mentions are found.
        """
        mentions = self._extract_entity_mentions(query)
        if not mentions and query:
            mentions = [query]
        ids = []
        for mention in mentions:
            entity = self.substrate.upsert_entity(
                entity_type=KnowledgeSubstrate.infer_entity_type(mention),
                name=mention,
            )
            ids.append(entity.id)
        return ids

    def _extract_entity_mentions(self, text: str) -> list[str]:
        """Extract structured identifier mentions (PMID / NCT) from *text*.

        Returns mentions in first-seen order, de-duplicated
        case-insensitively while preserving each mention's original casing.
        """
        mentions = []
        # Add PMID/NCT mentions if present.
        mentions.extend(re.findall(r"\bPMID[:\s]?\d+\b", text or "", flags=re.IGNORECASE))
        mentions.extend(re.findall(r"\bNCT\d{8}\b", text or "", flags=re.IGNORECASE))
        dedup = []
        seen = set()
        for m in mentions:
            norm = m.strip()
            if not norm:
                continue
            key = norm.lower()
            if key in seen:
                continue
            seen.add(key)
            dedup.append(norm)
        return dedup
ct/kb/reasoning.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Evidence ranking and contradiction analysis over the knowledge substrate.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import math
8
+ import time
9
+ from typing import Any
10
+
11
+ from ct.kb.substrate import KBClaim, KBRelation, KnowledgeSubstrate
12
+
13
+
14
# Per-source trust weights used by EvidenceReasoner.relation_score():
# a claim's contribution is scaled by its evidence's source type, with
# "unknown" as the fallback for missing or unrecognized source types.
SOURCE_WEIGHTS = {
    "pubmed": 0.9,
    "openalex": 0.85,
    "opentargets": 0.92,
    "tool": 0.75,
    "session": 0.65,
    "unknown": 0.5,
}
22
+
23
+
24
def _recency_weight(ts: float, now: float) -> float:
    """Exponentially decay evidence weight with age.

    Returns 1.0 for evidence timestamped at ``now``, decaying with a
    365-day time constant; timestamps in the future clamp to age zero.
    """
    seconds_old = now - ts
    days_old = seconds_old / 86400.0
    if days_old < 0.0:
        days_old = 0.0
    return math.exp(-days_old / 365.0)
27
+
28
+
29
class EvidenceReasoner:
    """Ranking and contradiction detector.

    Scores substrate relations from their claims (weighted by source trust,
    recency, and polarity) and surfaces relations that carry both supporting
    and contradicting evidence.
    """

    def __init__(self, substrate: KnowledgeSubstrate):
        self.substrate = substrate

    def relation_score(self, relation: KBRelation, *, now: float | None = None) -> float:
        """Aggregate weighted confidence for relation claims.

        Each claim contributes ``score * source_weight * recency * polarity``
        (support = +1, contradict = -1, anything else = 0.2); the result is
        the mean over all claims, or 0.0 when there are none.
        """
        now = now or time.time()
        if not relation.claims:
            return 0.0
        total = 0.0
        count = 0
        for claim in relation.claims:
            evidence = self.substrate.get_evidence(claim.evidence_id)
            source_type = evidence.source_type if evidence else "unknown"
            source_weight = SOURCE_WEIGHTS.get(source_type, SOURCE_WEIGHTS["unknown"])
            recency = _recency_weight(claim.timestamp, now)
            if claim.polarity == "support":
                polarity = 1.0
            elif claim.polarity == "contradict":
                polarity = -1.0
            else:
                # Neutral/unknown polarity contributes weakly positive.
                polarity = 0.2
            total += claim.score * source_weight * recency * polarity
            count += 1
        return total / max(count, 1)

    def rank_relations(
        self,
        *,
        entity_id: str | None = None,
        predicate: str | None = None,
        limit: int = 20,
    ) -> list[dict[str, Any]]:
        """Return highest-confidence relations.

        Optionally filtered to relations touching *entity_id* (as subject or
        object) and/or matching *predicate*; sorted by score, then claim
        count, then recency, descending. At most *limit* rows are returned.
        """
        def _matches(rel) -> bool:
            if entity_id and entity_id not in (rel.subject_id, rel.object_id):
                return False
            if predicate and rel.predicate != predicate:
                return False
            return True

        rows = [
            {
                "relation_id": rel.id,
                "subject_id": rel.subject_id,
                "predicate": rel.predicate,
                "object_id": rel.object_id,
                "score": round(self.relation_score(rel), 4),
                "n_claims": len(rel.claims),
                "last_seen": rel.last_seen,
            }
            for rel in self.substrate.list_relations()
            if _matches(rel)
        ]
        rows.sort(key=lambda row: (row["score"], row["n_claims"], row["last_seen"]), reverse=True)
        return rows[: max(limit, 0)]

    def detect_contradictions(
        self,
        *,
        entity_id: str | None = None,
        predicate: str | None = None,
        min_claims: int = 2,
    ) -> list[dict[str, Any]]:
        """Find relations with mixed support and contradiction evidence.

        Only relations carrying at least *min_claims* claims AND at least one
        claim of each polarity qualify. Results are sorted so the most
        evenly-contested, strongest, most recent conflicts come first.
        """
        found: list[dict[str, Any]] = []
        for rel in self.substrate.list_relations():
            if entity_id and entity_id not in (rel.subject_id, rel.object_id):
                continue
            if predicate and rel.predicate != predicate:
                continue
            if len(rel.claims) < min_claims:
                continue
            support = [claim for claim in rel.claims if claim.polarity == "support"]
            contradict = [claim for claim in rel.claims if claim.polarity == "contradict"]
            if not (support and contradict):
                continue
            found.append(
                {
                    "relation_id": rel.id,
                    "subject_id": rel.subject_id,
                    "predicate": rel.predicate,
                    "object_id": rel.object_id,
                    "support_claims": len(support),
                    "contradict_claims": len(contradict),
                    "support_score": round(self._avg_claim_score(support), 4),
                    "contradict_score": round(self._avg_claim_score(contradict), 4),
                    "last_seen": rel.last_seen,
                }
            )
        found.sort(
            key=lambda row: (
                min(row["support_claims"], row["contradict_claims"]),
                max(row["support_score"], row["contradict_score"]),
                row["last_seen"],
            ),
            reverse=True,
        )
        return found

    @staticmethod
    def _avg_claim_score(claims: list[KBClaim]) -> float:
        """Arithmetic mean of claim scores; 0.0 for an empty list."""
        if not claims:
            return 0.0
        total = sum(claim.score for claim in claims)
        return total / len(claims)