celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/api/app.py ADDED
@@ -0,0 +1,211 @@
1
+ """
2
+ FastAPI application for the ct Data Query API.
3
+
4
+ Serves filtered queries against large datasets (PerturbAtlas, ChEMBL, etc.)
5
+ using DuckDB for efficient Parquet/CSV.gz querying.
6
+
7
+ Run with: uvicorn ct.api.app:app --host 0.0.0.0 --port 8000
8
+ """
9
+
10
+ from fastapi import FastAPI, HTTPException
11
+ from pydantic import BaseModel, Field
12
+
13
+ from ct.api.config import discover_datasets, validate_schema, DATASET_REGISTRY, DEFAULT_QUERY_LIMIT
14
+ from ct.api.engine import QueryEngine
15
+
16
# FastAPI application instance; this metadata appears in the generated OpenAPI docs.
app = FastAPI(
    title="ct Data API",
    description="Query large drug discovery datasets via DuckDB",
    version="0.1.0",
)

# Single module-level DuckDB engine shared by all request handlers.
engine = QueryEngine()
23
+
24
+
25
class QueryRequest(BaseModel):
    """Request body for POST /query: a dataset name plus optional filters."""

    dataset: str = Field(..., description="Dataset name (e.g. 'perturbatlas', 'chembl')")
    gene: str | None = Field(None, description="Filter by gene symbol")
    compound: str | None = Field(None, description="Filter by compound name/ID")
    filters: dict | None = Field(None, description="Additional column filters")
    columns: list[str] | None = Field(None, description="Columns to return")
    limit: int = Field(DEFAULT_QUERY_LIMIT, ge=1, le=10000, description="Max rows")
    order_by: str | None = Field(None, description="Column to sort by")
33
+
34
+
35
class QueryResponse(BaseModel):
    """Response envelope for POST /query."""

    dataset: str
    total_rows: int  # number of rows returned (after LIMIT), not the dataset total
    data: list[dict]
39
+
40
+
41
@app.get("/health")
def health():
    """Liveness check that also reports which datasets are available on disk."""
    datasets = discover_datasets()
    names = list(datasets.keys())
    return {"status": "ok", "datasets": names, "n_datasets": len(names)}
50
+
51
+
52
@app.get("/health/schema")
def schema_health():
    """Deep health check — validates file schemas match expectations."""
    from fastapi.responses import JSONResponse

    results = validate_schema(engine)
    # Any single invalid dataset marks the whole service unhealthy (503).
    any_invalid = any(info["status"] == "invalid" for info in results.values())

    body = {
        "status": "unhealthy" if any_invalid else "ok",
        "datasets": results,
    }
    return JSONResponse(status_code=503 if any_invalid else 200, content=body)
67
+
68
+
69
@app.get("/datasets")
def list_datasets():
    """List available datasets with metadata."""
    entries = []
    for name, info in discover_datasets().items():
        # Merge discovery info (file counts/sizes) with declared registry schema.
        declared = DATASET_REGISTRY.get(name, {})
        entries.append(
            {
                "name": name,
                "description": info["description"],
                "format": info["format"],
                "n_files": info["n_files"],
                "total_size_mb": info["total_size_mb"],
                "filterable_columns": info.get("filterable", []),
                "columns": declared.get("columns", []),
                "required_columns": declared.get("required_columns", []),
            }
        )
    return entries
87
+
88
+
89
@app.post("/query", response_model=QueryResponse)
def query_dataset(req: QueryRequest):
    """Run a filtered query against a dataset."""
    if req.dataset not in DATASET_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown dataset: {req.dataset}")

    ds_config = DATASET_REGISTRY[req.dataset]

    # Assemble WHERE filters from the explicit filter dict plus convenience fields.
    filters = dict(req.filters) if req.filters else {}
    if req.gene:
        filters["gene"] = req.gene
    if req.compound:
        # Map the compound onto whichever known compound-ish column this
        # dataset declares as filterable; fall back to a literal "compound".
        filterable = ds_config.get("filterable", [])
        candidates = ("compound", "molecule_chembl_id", "pert_name", "compound_name")
        column = next((c for c in candidates if c in filterable), "compound")
        filters[column] = req.compound

    try:
        rows = engine.query_parquet(
            file_pattern=ds_config["path_pattern"],
            filters=filters if filters else None,
            columns=req.columns,
            limit=req.limit,
            order_by=req.order_by,
        )
    except FileNotFoundError:
        raise HTTPException(
            status_code=503,
            detail=f"Dataset '{req.dataset}' files not found on disk",
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    return QueryResponse(dataset=req.dataset, total_rows=len(rows), data=rows)
130
+
131
+
132
@app.get("/datasets/{name}/gene/{gene}")
def gene_summary(name: str, gene: str):
    """Get a gene-level summary from a dataset.

    Returns the perturbation count, mean effect size (log2FoldChange with a
    fallback to lfc), the number of significant records (adjusted p < 0.05,
    falling back to raw p-value), and up to 10 sample rows.

    Raises:
        HTTPException 404: unknown dataset, or gene absent from it.
        HTTPException 503: dataset files missing on disk.
    """
    if name not in DATASET_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown dataset: {name}")

    ds_config = DATASET_REGISTRY[name]

    # Keep the try narrow: only the query can raise FileNotFoundError, and
    # HTTPExceptions raised below must not be entangled with it.
    try:
        data = engine.query_parquet(
            file_pattern=ds_config["path_pattern"],
            filters={"gene": gene},
            limit=1000,
        )
    except FileNotFoundError:
        raise HTTPException(status_code=503, detail=f"Dataset '{name}' files not found")

    if not data:
        raise HTTPException(status_code=404, detail=f"Gene {gene} not found in {name}")

    import statistics

    # Effect size per record: prefer log2FoldChange, fall back to lfc.
    # Use explicit None checks so legitimate 0.0 effect sizes are kept —
    # the previous truthiness filter dropped them and biased the mean.
    effects = []
    for r in data:
        effect = r.get("log2FoldChange")
        if effect is None:
            effect = r.get("lfc")
        if effect is not None:
            effects.append(effect)
    mean_effect = statistics.mean(effects) if effects else 0

    def _p_value(r: dict) -> float:
        # Prefer the adjusted p-value when present. padj of exactly 0.0 is a
        # valid (maximally significant) value and must not fall through to
        # pvalue, which `padj or pvalue` would wrongly do.
        padj = r.get("padj")
        return padj if padj is not None else r.get("pvalue", 1)

    return {
        "gene": gene,
        "dataset": name,
        "n_perturbations": len(data),
        "mean_effect": round(mean_effect, 4),
        "n_significant": sum(1 for r in data if _p_value(r) < 0.05),
        "sample_data": data[:10],
    }
169
+
170
+
171
@app.get("/datasets/{name}/compound/{compound}")
def compound_summary(name: str, compound: str):
    """Get a compound-level summary from a dataset."""
    if name not in DATASET_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown dataset: {name}")

    ds_config = DATASET_REGISTRY[name]

    # Pick the first known compound-identifier column the dataset can filter
    # on; default to a literal "compound" column when none is declared.
    filterable = ds_config.get("filterable", [])
    candidates = ("compound", "molecule_chembl_id", "pert_name", "compound_name")
    column = next((c for c in candidates if c in filterable), "compound")

    try:
        data = engine.query_parquet(
            file_pattern=ds_config["path_pattern"],
            filters={column: compound},
            limit=1000,
        )

        if not data:
            raise HTTPException(
                status_code=404,
                detail=f"Compound {compound} not found in {name}",
            )

        return {
            "compound": compound,
            "dataset": name,
            "n_records": len(data),
            "sample_data": data[:10],
        }
    except FileNotFoundError:
        raise HTTPException(status_code=503, detail=f"Dataset '{name}' files not found")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
ct/api/config.py ADDED
@@ -0,0 +1,120 @@
1
+ """
2
+ Configuration for the ct Data Query API.
3
+
4
+ Maps dataset names to file paths and schemas. The data root is configurable
5
+ via CT_DATA_ROOT environment variable or defaults to /data (for Docker).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ from pathlib import Path
13
+
14
logger = logging.getLogger(__name__)

# Root directory holding all dataset files; override via CT_DATA_ROOT
# (defaults to /data for the Docker image).
DATA_ROOT = Path(os.environ.get("CT_DATA_ROOT", "/data"))

# Dataset registry: name → config
# Each entry declares where the files live (path_pattern, relative to
# DATA_ROOT), their on-disk format, the declared/required column schema used
# by validate_schema(), and which columns queries may filter on.
DATASET_REGISTRY = {
    "perturbatlas": {
        "description": "PerturbAtlas: differential expression from 2,066 perturbation experiments",
        "path_pattern": "perturbatlas/Homo sapiens/*/degs.csv.gz",
        "format": "csv.gz",
        "columns": ["column0", "perturb_id", "gene", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj"],
        "required_columns": ["gene", "log2FoldChange", "pvalue", "padj"],
        "filterable": ["gene", "perturb_id"],
    },
    "chembl": {
        "description": "ChEMBL v36: bioactivity data for drug-like compounds",
        "path_pattern": "chembl/chembl/36/*.parquet",
        "format": "parquet",
        # Empty columns list: schema validation then only enforces
        # required_columns for this dataset.
        "columns": [],
        "required_columns": ["molecule_chembl_id", "target_chembl_id"],
        "filterable": ["molecule_chembl_id", "target_chembl_id", "pchembl_value"],
    },
}

# Maximum rows returned per query
MAX_QUERY_ROWS = 10000
# Rows returned when a query does not specify a limit.
DEFAULT_QUERY_LIMIT = 100
41
+
42
+
43
def discover_datasets() -> dict:
    """Discover which datasets are actually available on disk.

    Globs each registry entry's path_pattern under DATA_ROOT and returns
    {name: config + n_files/total_size_mb} for every dataset with at least
    one matching file; datasets with no files are omitted.
    """
    available = {}
    for name, config in DATASET_REGISTRY.items():
        # Path.glob simply yields nothing when the directory tree is absent,
        # so no existence pre-check is needed. (Splitting the pattern at "*"
        # and testing the prefix as a directory is wrong for patterns where
        # "*" is not preceded by "/", e.g. "data_*.parquet".)
        files = list(DATA_ROOT.glob(config["path_pattern"]))
        if not files:
            continue
        available[name] = {
            **config,
            "n_files": len(files),
            "total_size_mb": round(sum(f.stat().st_size for f in files) / 1e6, 1),
        }
    return available
61
+
62
+
63
def validate_schema(engine=None) -> dict[str, dict]:
    """Validate that on-disk files match expected schemas.

    Samples one file per dataset, reads column names via DuckDB,
    and compares against DATASET_REGISTRY declarations.
    Returns {dataset_name: {status, expected, actual, missing, extra}}.
    """
    if engine is None:
        from ct.api.engine import QueryEngine
        engine = QueryEngine()

    report = {}
    for name, config in DATASET_REGISTRY.items():
        pattern = config["path_pattern"]

        if not list(DATA_ROOT.glob(pattern)):
            report[name] = {
                "status": "unavailable",
                "message": "No files found on disk",
            }
            continue

        try:
            actual = set(engine.sample_columns(pattern))
        except Exception as e:
            report[name] = {
                "status": "error",
                "message": f"Failed to read columns: {e}",
            }
            continue

        declared = set(config.get("columns", []))
        required = set(config.get("required_columns", []))

        missing_required = required - actual
        # When no columns are declared, skip declared/extra bookkeeping.
        missing_declared = (declared - actual) if declared else set()
        extra = (actual - declared) if declared else set()

        # Missing required columns ⇒ invalid; missing declared-only ⇒ warning.
        if missing_required:
            status = "invalid"
        elif missing_declared:
            status = "warning"
        else:
            status = "valid"

        report[name] = {
            "status": status,
            "actual_columns": sorted(actual),
            "expected_columns": sorted(declared) if declared else None,
            "required_columns": sorted(required),
            "missing_required": sorted(missing_required) if missing_required else [],
            "missing_declared": sorted(missing_declared) if missing_declared else [],
            "extra_columns": sorted(extra) if extra else [],
        }

    return report
ct/api/engine.py ADDED
@@ -0,0 +1,124 @@
1
+ """
2
+ DuckDB query engine for the ct Data API.
3
+
4
+ Queries Parquet and CSV.gz files directly without loading into memory.
5
+ Supports predicate pushdown for efficient filtering of large datasets.
6
+ """
7
+
8
+ from pathlib import Path
9
+
10
+ import duckdb
11
+
12
+ from ct.api.config import DATA_ROOT, MAX_QUERY_ROWS, DEFAULT_QUERY_LIMIT
13
+
14
+
15
class QueryEngine:
    """DuckDB-based query engine for tabular data files.

    Builds parameterized SQL against read_parquet / read_csv_auto so large
    Parquet and CSV.gz files are scanned with predicate pushdown instead of
    being loaded into memory. Filter values are always bind parameters and
    column identifiers are quoted, since both arrive from API requests.
    """

    def __init__(self, data_root: Path | None = None):
        """Create an engine rooted at *data_root* (defaults to DATA_ROOT)."""
        self.data_root = data_root or DATA_ROOT
        # One in-memory connection per engine; files are read lazily per query.
        self.conn = duckdb.connect(":memory:")

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Quote a SQL identifier, escaping embedded double quotes.

        Column names come from untrusted API input; interpolating them raw
        into SQL would be an injection vector.
        """
        return '"' + str(name).replace('"', '""') + '"'

    def _source(self, file_pattern: str) -> tuple[str, str]:
        """Return (full glob path, DuckDB table-function expression).

        Parquet wins when both extensions appear; CSV is read with
        ignore_errors so one malformed row does not abort the scan;
        anything else is treated as Parquet.
        """
        full_pattern = str(self.data_root / file_pattern)
        if ".parquet" in file_pattern:
            src = f"read_parquet('{full_pattern}', union_by_name=true)"
        elif ".csv" in file_pattern:  # covers both .csv and .csv.gz
            src = f"read_csv_auto('{full_pattern}', union_by_name=true, ignore_errors=true)"
        else:
            src = f"read_parquet('{full_pattern}', union_by_name=true)"
        return full_pattern, src

    @classmethod
    def _build_where(cls, filters: dict | None) -> tuple[str, list]:
        """Build a parameterized WHERE clause from a column→value mapping.

        Lists become IN (...), strings containing '%' become LIKE, anything
        else is an equality test. Returns ("", []) when there are no filters.
        """
        if not filters:
            return "", []
        parts: list[str] = []
        params: list = []
        for col, val in filters.items():
            ident = cls._quote_ident(col)
            if isinstance(val, list):
                marks = ", ".join("?" for _ in val)
                parts.append(f"{ident} IN ({marks})")
                params.extend(val)
            elif isinstance(val, str) and "%" in val:
                parts.append(f"{ident} LIKE ?")
                params.append(val)
            else:
                parts.append(f"{ident} = ?")
                params.append(val)
        return " WHERE " + " AND ".join(parts), params

    def query_parquet(self, file_pattern: str, filters: dict | None = None,
                      columns: list | None = None, limit: int | None = None,
                      order_by: str | None = None) -> list[dict]:
        """Query Parquet or CSV.gz files with optional filtering.

        Args:
            file_pattern: Glob pattern relative to data_root (e.g. "chembl/36/*.parquet")
            filters: Dict of column_name → value for WHERE clauses
            columns: List of columns to SELECT (default: all)
            limit: Max rows to return (capped at MAX_QUERY_ROWS)
            order_by: Column to ORDER BY

        Returns:
            List of dicts, one per row.

        Raises:
            FileNotFoundError: no files match the pattern on disk.
            ValueError: unknown column or otherwise unbindable query.
        """
        full_pattern, source = self._source(file_pattern)
        limit = min(limit or DEFAULT_QUERY_LIMIT, MAX_QUERY_ROWS)

        # Quote user-supplied identifiers; values go through bind parameters.
        if columns:
            select_cols = ", ".join(self._quote_ident(c) for c in columns)
        else:
            select_cols = "*"
        where_clause, params = self._build_where(filters)
        order_clause = f" ORDER BY {self._quote_ident(order_by)}" if order_by else ""

        query = f"SELECT {select_cols} FROM {source}{where_clause}{order_clause} LIMIT {limit}"

        try:
            result = self.conn.execute(query, params).fetchdf()
            return result.to_dict("records")
        except duckdb.IOException as e:
            raise FileNotFoundError(f"Data files not found: {full_pattern}") from e
        except (duckdb.CatalogException, duckdb.BinderException) as e:
            raise ValueError(f"Query error (bad column?): {e}") from e

    def sample_columns(self, file_pattern: str) -> list[str]:
        """Read column names from first matching file without loading data."""
        _, source = self._source(file_pattern)
        # LIMIT 0 materializes only the schema, never any rows.
        result = self.conn.execute(f"SELECT * FROM {source} LIMIT 0")
        return [desc[0] for desc in result.description]

    def count(self, file_pattern: str, filters: dict | None = None) -> int:
        """Count rows matching filters; returns 0 when no files exist.

        Uses the same source construction and WHERE semantics as
        query_parquet (including ignore_errors for CSV and LIKE for '%'
        strings), so counts agree with query results.
        """
        _, source = self._source(file_pattern)
        where_clause, params = self._build_where(filters)
        query = f"SELECT COUNT(*) as cnt FROM {source}{where_clause}"

        try:
            row = self.conn.execute(query, params).fetchone()
            return row[0] if row else 0
        except duckdb.IOException:
            return 0