celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/clue.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""
|
|
2
|
+
L1000/CMap compound signature and connectivity tools.
|
|
3
|
+
|
|
4
|
+
Uses local L1000 Level 5 compound profiles (19,811 compounds × 978 landmark genes)
|
|
5
|
+
built from the Broad LINCS GSE92742 dataset. A CLUE API key can be configured, but this module does not call the CLUE API.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from functools import lru_cache
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from ct.tools import registry
|
|
14
|
+
from ct.agent.config import Config
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_clue_key() -> str | None:
    """Return the CLUE API key, or None when no key is available.

    The ``clue.api_key`` config entry takes precedence; the
    ``CLUE_API_KEY`` environment variable is consulted only when that
    entry is missing or falsy.
    """
    import os

    configured_key = Config.load().get("clue.api_key")
    if configured_key:
        return configured_key
    return os.environ.get("CLUE_API_KEY")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ── Local data paths ──
# Default on-disk locations of the L1000 parquet files read by the loaders below.

_LINCS_DIR = Path("/mnt2/bronze/lincs")
_PROFILES_PATH = _LINCS_DIR.joinpath("l1000_compound_profiles.parquet")
_METADATA_PATH = _LINCS_DIR.joinpath("l1000_pert_metadata.parquet")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@lru_cache(maxsize=1)
def _load_profiles() -> pd.DataFrame:
    """Load the L1000 compound profile matrix from the first location that exists.

    Locations are tried in this order:

    1. the ``data.l1000_profiles`` config entry (must be a regular file),
    2. the module default ``_PROFILES_PATH``,
    3. ``<data.base>/lincs/l1000_compound_profiles.parquet``.

    The result is memoised by ``lru_cache``, so every caller receives the
    same DataFrame object and should treat it as read-only. A failed lookup
    raises and is therefore not cached; the search runs again on the next call.

    Returns:
        The parquet contents as a DataFrame. Callers in this module look up
        compounds in the row index and genes in the columns.

    Raises:
        FileNotFoundError: If none of the candidate locations exists. The
            message lists every location that was checked.
    """
    cfg = Config.load()
    searched: list[str] = []

    configured = cfg.get("data.l1000_profiles")
    if configured:
        configured_path = Path(configured)
        searched.append(str(configured_path))
        # A configured path that is not a regular file falls through to the
        # defaults rather than failing immediately.
        if configured_path.is_file():
            return pd.read_parquet(configured_path)

    searched.append(str(_PROFILES_PATH))
    if _PROFILES_PATH.exists():
        return pd.read_parquet(_PROFILES_PATH)

    base = cfg.get("data.base")
    if base:
        candidate = Path(base) / "lincs" / "l1000_compound_profiles.parquet"
        searched.append(str(candidate))
        if candidate.exists():
            return pd.read_parquet(candidate)

    # Report every location that was actually tried, not just the default,
    # so a misconfigured data.l1000_profiles / data.base is visible to the user.
    raise FileNotFoundError(
        "L1000 compound profiles not found. "
        f"Expected at: {', '.join(searched)}"
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@lru_cache(maxsize=1)
def _load_pert_metadata() -> pd.DataFrame:
    """Return the perturbagen metadata table (SMILES, PubChem CID, etc.).

    Reads the parquet file at ``_METADATA_PATH``. When that file does not
    exist an empty DataFrame is returned instead of raising. The result
    is memoised by ``lru_cache``.
    """
    if not _METADATA_PATH.exists():
        return pd.DataFrame()
    return pd.read_parquet(_METADATA_PATH)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _find_compound(name: str, profiles: pd.DataFrame) -> str | None:
|
|
67
|
+
"""Find a compound in profiles by case-insensitive name matching."""
|
|
68
|
+
name_lower = name.lower().strip()
|
|
69
|
+
# Build lowercase index for matching
|
|
70
|
+
idx_lower = {c.lower(): c for c in profiles.index}
|
|
71
|
+
if name_lower in idx_lower:
|
|
72
|
+
return idx_lower[name_lower]
|
|
73
|
+
# Try partial match
|
|
74
|
+
for key, original in idx_lower.items():
|
|
75
|
+
if name_lower in key or key in name_lower:
|
|
76
|
+
return original
|
|
77
|
+
return None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@registry.register(
    name="clue.compound_signature",
    description="Get the L1000 transcriptomic signature (up/down-regulated genes) for a compound",
    category="clue",
    parameters={
        "compound": "Compound name (e.g. 'vorinostat', 'lenalidomide', 'bortezomib')",
        "top_n": "Number of top up/down genes to return (default 50)",
    },
    usage_guide=(
        "You need the transcriptomic signature (up/down genes) of a compound from L1000/CMap. "
        "Use to understand a compound's mechanism of action or as input for connectivity queries. "
        "Covers ~19,800 compounds from the Broad LINCS dataset."
    ),
)
def compound_signature(compound: str, top_n: int = 50, **kwargs) -> dict:
    """Get the L1000 transcriptomic signature for a compound from local data.

    Args:
        compound: Compound name; resolved against the profile index by
            ``_find_compound`` (case-insensitive, partial matches allowed).
        top_n: How many of the highest and lowest scoring genes to return.
            Must be coercible to an integer >= 1.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``compound`` (the matched index
        label), ``pert_id``, ``smiles``, ``pubchem_cid``,
        ``n_signatures_aggregated``, ``up_genes`` and ``down_genes``; the two
        gene lists hold ``{"gene", "z_score"}`` dicts ordered from most to
        least extreme. On failure, a dict with ``error`` and ``summary`` only.
        No exception is raised for invalid input or missing data files.
    """
    if not isinstance(compound, str) or not compound.strip():
        return {
            "error": "compound parameter required",
            "summary": "Provide a compound name (e.g. 'lenalidomide').",
        }

    # A non-positive top_n would leave the gene lists empty (or, for negative
    # values, make head()/tail() return the wrong slice) and crash on
    # up_genes[0] when the summary is built.
    try:
        top_n = int(top_n)
    except (TypeError, ValueError, OverflowError):
        top_n = 0
    if top_n < 1:
        return {
            "error": "top_n must be a positive integer",
            "summary": "Provide top_n >= 1 (default 50).",
        }

    try:
        profiles = _load_profiles()
    except FileNotFoundError as e:
        return {"error": str(e), "summary": str(e)}

    # Find compound in profiles
    matched = _find_compound(compound, profiles)
    if matched is None:
        return {
            "error": f"Compound '{compound}' not found in L1000 data ({len(profiles)} compounds available).",
            "summary": f"Compound '{compound}' not found in L1000/CMap database.",
        }

    # Extract profile
    profile = profiles.loc[matched]
    if isinstance(profile, pd.DataFrame):
        # Duplicate index labels make .loc return several rows; use the first
        # so the per-gene sort below still operates on a single Series.
        profile = profile.iloc[0]

    # Missing values sort to the end and would otherwise be reported as the
    # most down-regulated genes, so drop them before ranking.
    sorted_genes = profile.dropna().sort_values(ascending=False)
    if sorted_genes.empty:
        return {
            "error": f"No expression values available for compound '{matched}'.",
            "summary": f"L1000 profile for {matched} contains no usable z-scores.",
        }

    up_genes = [
        {"gene": g, "z_score": round(float(v), 4)}
        for g, v in sorted_genes.head(top_n).items()
    ]
    # tail() is ascending-from-the-end; reverse so the most negative comes first.
    down_genes = [
        {"gene": g, "z_score": round(float(v), 4)}
        for g, v in sorted_genes.tail(top_n).iloc[::-1].items()
    ]

    # Get metadata if available (the table is empty when its file is absent)
    meta = _load_pert_metadata()
    meta_fields = {"pert_id": "", "canonical_smiles": "", "pubchem_cid": ""}
    if matched in meta.index:
        row = meta.loc[matched]
        if isinstance(row, pd.DataFrame):
            # Same duplicate-label situation as above.
            row = row.iloc[0]
        for key in meta_fields:
            value = row.get(key)
            if pd.notna(value):
                meta_fields[key] = str(value)

    # Report the number of genes actually returned, which is smaller than
    # top_n when the profile has fewer usable genes.
    summary = (
        f"L1000 signature for {matched}: "
        f"{len(up_genes)} up-regulated genes (top: {up_genes[0]['gene']} z={up_genes[0]['z_score']}), "
        f"{len(down_genes)} down-regulated genes (top: {down_genes[0]['gene']} z={down_genes[0]['z_score']})"
    )

    return {
        "summary": summary,
        "compound": matched,
        "pert_id": meta_fields["pert_id"],
        "smiles": meta_fields["canonical_smiles"],
        "pubchem_cid": meta_fields["pubchem_cid"],
        # NOTE(review): this is the number of rows in the profile matrix, not a
        # per-compound signature count; kept as-is for output compatibility.
        "n_signatures_aggregated": len(profiles),
        "up_genes": up_genes,
        "down_genes": down_genes,
    }
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@registry.register(
    name="clue.connectivity_query",
    description="Find compounds with similar or opposing transcriptomic signatures to a gene set",
    category="clue",
    parameters={
        "gene_list": "Dict with 'up' and 'down' keys, each a list of gene symbols",
        "n_results": "Number of top results to return (default 20)",
    },
    usage_guide=(
        "You have a gene signature (up/down-regulated genes) and want to find compounds "
        "with similar or opposing transcriptomic effects. Core CMap analysis. "
        "Use to find drug repurposing candidates or understand mechanism of action."
    ),
)
def connectivity_query(gene_list: dict = None, n_results: int = 20, **kwargs) -> dict:
    """Rank local L1000 compound profiles against an up/down gene signature.

    For each compound the score is the mean z-score over the matched "up"
    genes minus the mean z-score over the matched "down" genes. A positive
    score means the compound mimics the signature; a negative score means
    it opposes it.

    Args:
        gene_list: Dict with optional ``"up"`` and ``"down"`` entries, each a
            list (or tuple/set) of gene symbols. A single string is treated
            as a one-element list.
        n_results: How many compounds to return at each end of the ranking.
            Must be coercible to an integer >= 1.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``n_up_matched``,
        ``n_down_matched``, ``n_compounds_scored``, ``top_similar`` and
        ``top_opposing``; the two lists hold
        ``{"compound", "connectivity_score"}`` dicts, most extreme first.
        On failure, a dict with ``error`` and ``summary`` only. No exception
        is raised for invalid input or a missing profile file.
    """
    import warnings

    if not gene_list or not isinstance(gene_list, dict):
        return {
            "error": "gene_list must be a dict with 'up' and 'down' keys",
            "summary": "Invalid input: provide gene_list={'up': [...], 'down': [...]}",
        }

    def _as_gene_list(value):
        # None/empty -> []; a bare string is one gene, not a sequence of
        # characters; anything that is not a plain collection is rejected.
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return None

    up_genes = _as_gene_list(gene_list.get("up"))
    down_genes = _as_gene_list(gene_list.get("down"))
    if up_genes is None or down_genes is None:
        return {
            "error": "gene_list must be a dict with 'up' and 'down' keys",
            "summary": "Invalid input: provide gene_list={'up': [...], 'down': [...]}",
        }

    if not up_genes and not down_genes:
        return {
            "error": "gene_list must have at least one gene in 'up' or 'down'",
            "summary": "Provide at least one up- or down-regulated gene.",
        }

    # A non-positive n_results would yield empty (or wrongly sliced) result
    # lists and crash when the summary reads the first opposing hit.
    try:
        n_results = int(n_results)
    except (TypeError, ValueError, OverflowError):
        n_results = 0
    if n_results < 1:
        return {
            "error": "n_results must be a positive integer",
            "summary": "Provide n_results >= 1 (default 20).",
        }

    try:
        profiles = _load_profiles()
    except FileNotFoundError as e:
        return {"error": str(e), "summary": str(e)}

    # Find which query genes are in the profile columns
    available_up = [g for g in up_genes if g in profiles.columns]
    available_down = [g for g in down_genes if g in profiles.columns]

    if not available_up and not available_down:
        return {
            "error": "None of the query genes found in L1000 landmark genes.",
            "summary": (
                f"0/{len(up_genes)} up genes and 0/{len(down_genes)} down genes "
                "matched L1000 landmark genes (978 genes)."
            ),
        }

    # score = mean(z-scores of up genes) - mean(z-scores of down genes)
    score = np.zeros(len(profiles))
    with warnings.catch_warnings():
        # nanmean warns for rows where every selected gene is NaN; those rows
        # get a NaN score and are removed from the ranking below.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        if available_up:
            score += np.nanmean(profiles[available_up].values, axis=1)
        if available_down:
            score -= np.nanmean(profiles[available_down].values, axis=1)

    # Rank compounds. NaN scores sort last and would otherwise be reported
    # as the strongest opposers, so they are dropped first.
    results_df = (
        pd.DataFrame({"compound": profiles.index, "connectivity_score": score})
        .dropna(subset=["connectivity_score"])
        .sort_values("connectivity_score", ascending=False)
    )

    # Top similar (positive scores) and top opposing (negative scores)
    top_similar = results_df.head(n_results).to_dict("records")
    top_opposing = results_df.tail(n_results).iloc[::-1].to_dict("records")

    for row in top_similar + top_opposing:
        # float() keeps the value a plain Python float regardless of how
        # to_dict() boxed it.
        row["connectivity_score"] = round(float(row["connectivity_score"]), 4)

    # Both ends fall back to placeholders when no compound could be scored.
    best = top_similar[0] if top_similar else {}
    worst = top_opposing[0] if top_opposing else {}
    summary = (
        f"Connectivity query: {len(available_up)}/{len(up_genes)} up, "
        f"{len(available_down)}/{len(down_genes)} down genes matched. "
        f"Scored {len(results_df)} compounds. "
        f"Top mimicker: {best.get('compound', 'N/A')} (score={best.get('connectivity_score', 0)}). "
        f"Top opposer: {worst.get('compound', 'N/A')} (score={worst.get('connectivity_score', 0)})"
    )

    return {
        "summary": summary,
        "n_up_matched": len(available_up),
        "n_down_matched": len(available_down),
        "n_compounds_scored": len(results_df),
        "top_similar": top_similar,
        "top_opposing": top_opposing,
    }
|