celltype_cli-0.1.0-py3-none-any.whl
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/api/app.py
ADDED
@@ -0,0 +1,211 @@
"""
FastAPI application for the ct Data Query API.

Serves filtered queries against large datasets (PerturbAtlas, ChEMBL, etc.)
using DuckDB for efficient Parquet/CSV.gz querying.

Run with: uvicorn ct.api.app:app --host 0.0.0.0 --port 8000
"""

import statistics

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field

from ct.api.config import discover_datasets, validate_schema, DATASET_REGISTRY, DEFAULT_QUERY_LIMIT
from ct.api.engine import QueryEngine

app = FastAPI(
    title="ct Data API",
    description="Query large drug discovery datasets via DuckDB",
    version="0.1.0",
)

engine = QueryEngine()


class QueryRequest(BaseModel):
    dataset: str = Field(..., description="Dataset name (e.g. 'perturbatlas', 'chembl')")
    gene: str | None = Field(None, description="Filter by gene symbol")
    compound: str | None = Field(None, description="Filter by compound name/ID")
    filters: dict | None = Field(None, description="Additional column filters")
    columns: list[str] | None = Field(None, description="Columns to return")
    limit: int = Field(DEFAULT_QUERY_LIMIT, ge=1, le=10000, description="Max rows")
    order_by: str | None = Field(None, description="Column to sort by")


class QueryResponse(BaseModel):
    dataset: str
    total_rows: int
    data: list[dict]


@app.get("/health")
def health():
    """Health check — also reports available datasets."""
    available = discover_datasets()
    return {
        "status": "ok",
        "datasets": list(available.keys()),
        "n_datasets": len(available),
    }


@app.get("/health/schema")
def schema_health():
    """Deep health check — validates file schemas match expectations."""
    results = validate_schema(engine)
    has_invalid = any(r["status"] == "invalid" for r in results.values())

    status_code = 503 if has_invalid else 200
    return JSONResponse(
        status_code=status_code,
        content={
            "status": "unhealthy" if has_invalid else "ok",
            "datasets": results,
        },
    )


@app.get("/datasets")
def list_datasets():
    """List available datasets with metadata."""
    available = discover_datasets()
    result = []
    for name, info in available.items():
        registry_entry = DATASET_REGISTRY.get(name, {})
        result.append({
            "name": name,
            "description": info["description"],
            "format": info["format"],
            "n_files": info["n_files"],
            "total_size_mb": info["total_size_mb"],
            "filterable_columns": info.get("filterable", []),
            "columns": registry_entry.get("columns", []),
            "required_columns": registry_entry.get("required_columns", []),
        })
    return result


@app.post("/query", response_model=QueryResponse)
def query_dataset(req: QueryRequest):
    """Run a filtered query against a dataset."""
    if req.dataset not in DATASET_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown dataset: {req.dataset}")

    ds_config = DATASET_REGISTRY[req.dataset]

    # Build filters
    filters = dict(req.filters) if req.filters else {}
    if req.gene:
        filters["gene"] = req.gene
    if req.compound:
        # Try common column names for compound
        for col in ["compound", "molecule_chembl_id", "pert_name", "compound_name"]:
            if col in ds_config.get("filterable", []):
                filters[col] = req.compound
                break
        else:
            filters["compound"] = req.compound

    try:
        data = engine.query_parquet(
            file_pattern=ds_config["path_pattern"],
            filters=filters if filters else None,
            columns=req.columns,
            limit=req.limit,
            order_by=req.order_by,
        )
        return QueryResponse(
            dataset=req.dataset,
            total_rows=len(data),
            data=data,
        )
    except FileNotFoundError:
        raise HTTPException(
            status_code=503,
            detail=f"Dataset '{req.dataset}' files not found on disk",
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))


@app.get("/datasets/{name}/gene/{gene}")
def gene_summary(name: str, gene: str):
    """Get a gene-level summary from a dataset."""
    if name not in DATASET_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown dataset: {name}")

    ds_config = DATASET_REGISTRY[name]

    try:
        data = engine.query_parquet(
            file_pattern=ds_config["path_pattern"],
            filters={"gene": gene},
            limit=1000,
        )

        if not data:
            raise HTTPException(status_code=404, detail=f"Gene {gene} not found in {name}")

        # Compute summary stats
        effects = [r.get("log2FoldChange", r.get("lfc", 0)) for r in data if r.get("log2FoldChange") or r.get("lfc")]
        mean_effect = statistics.mean(effects) if effects else 0
        n_perturbations = len(data)

        return {
            "gene": gene,
            "dataset": name,
            "n_perturbations": n_perturbations,
            "mean_effect": round(mean_effect, 4),
            "n_significant": sum(
                1 for r in data
                if (r.get("padj") or r.get("pvalue", 1)) < 0.05
            ),
            "sample_data": data[:10],
        }
    except FileNotFoundError:
        raise HTTPException(status_code=503, detail=f"Dataset '{name}' files not found")


@app.get("/datasets/{name}/compound/{compound}")
def compound_summary(name: str, compound: str):
    """Get a compound-level summary from a dataset."""
    if name not in DATASET_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown dataset: {name}")

    ds_config = DATASET_REGISTRY[name]

    # Try common compound column names — use filterable from dataset config
    filters = {}
    filterable = ds_config.get("filterable", [])
    for col in ["compound", "molecule_chembl_id", "pert_name", "compound_name"]:
        if col in filterable:
            filters[col] = compound
            break
    else:
        filters["compound"] = compound

    try:
        data = engine.query_parquet(
            file_pattern=ds_config["path_pattern"],
            filters=filters,
            limit=1000,
        )

        if not data:
            raise HTTPException(
                status_code=404,
                detail=f"Compound {compound} not found in {name}",
            )

        return {
            "compound": compound,
            "dataset": name,
            "n_records": len(data),
            "sample_data": data[:10],
        }
    except FileNotFoundError:
        raise HTTPException(status_code=503, detail=f"Dataset '{name}' files not found")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
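To make the endpoint shapes concrete, here is a minimal client sketch against a locally running instance. This is illustrative and not part of the package: it assumes the server was started with the uvicorn command from the module docstring, that PerturbAtlas files exist under CT_DATA_ROOT, and it uses TP53 purely as an example gene symbol. Only the standard library is needed.

    import json
    import urllib.request

    BASE = "http://localhost:8000"

    # POST /query: filtered rows from a registered dataset.
    body = json.dumps({"dataset": "perturbatlas", "gene": "TP53", "limit": 5}).encode()
    req = urllib.request.Request(
        f"{BASE}/query",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        payload = json.load(resp)
    print(payload["total_rows"], "rows from", payload["dataset"])

    # GET /datasets/{name}/gene/{gene}: server-side summary stats.
    with urllib.request.urlopen(f"{BASE}/datasets/perturbatlas/gene/TP53") as resp:
        summary = json.load(resp)
    print(summary["n_perturbations"], "perturbations, mean effect:", summary["mean_effect"])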
ct/api/config.py
ADDED
@@ -0,0 +1,120 @@
"""
Configuration for the ct Data Query API.

Maps dataset names to file paths and schemas. The data root is configurable
via the CT_DATA_ROOT environment variable and defaults to /data (for Docker).
"""

from __future__ import annotations

import logging
import os
from pathlib import Path

logger = logging.getLogger(__name__)

DATA_ROOT = Path(os.environ.get("CT_DATA_ROOT", "/data"))

# Dataset registry: name → config
DATASET_REGISTRY = {
    "perturbatlas": {
        "description": "PerturbAtlas: differential expression from 2,066 perturbation experiments",
        "path_pattern": "perturbatlas/Homo sapiens/*/degs.csv.gz",
        "format": "csv.gz",
        "columns": ["column0", "perturb_id", "gene", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj"],
        "required_columns": ["gene", "log2FoldChange", "pvalue", "padj"],
        "filterable": ["gene", "perturb_id"],
    },
    "chembl": {
        "description": "ChEMBL v36: bioactivity data for drug-like compounds",
        "path_pattern": "chembl/chembl/36/*.parquet",
        "format": "parquet",
        "columns": [],
        "required_columns": ["molecule_chembl_id", "target_chembl_id"],
        "filterable": ["molecule_chembl_id", "target_chembl_id", "pchembl_value"],
    },
}

# Maximum rows returned per query
MAX_QUERY_ROWS = 10000
DEFAULT_QUERY_LIMIT = 100


def discover_datasets() -> dict:
    """Discover which datasets are actually available on disk."""
    available = {}
    for name, config in DATASET_REGISTRY.items():
        pattern = config["path_pattern"]
        # Check if any files match the pattern
        parts = pattern.split("*")
        base_dir = DATA_ROOT / parts[0].rstrip("/")
        if base_dir.exists():
            # Glob for matching files
            files = list(DATA_ROOT.glob(pattern))
            if files:
                available[name] = {
                    **config,
                    "n_files": len(files),
                    "total_size_mb": round(sum(f.stat().st_size for f in files) / 1e6, 1),
                }
    return available


def validate_schema(engine=None) -> dict[str, dict]:
    """Validate that on-disk files match expected schemas.

    Samples one file per dataset, reads column names via DuckDB,
    and compares against DATASET_REGISTRY declarations.
    Returns {dataset_name: {status, expected, actual, missing, extra}}.
    """
    if engine is None:
        from ct.api.engine import QueryEngine
        engine = QueryEngine()

    results = {}
    for name, config in DATASET_REGISTRY.items():
        pattern = config["path_pattern"]
        files = list(DATA_ROOT.glob(pattern))

        if not files:
            results[name] = {
                "status": "unavailable",
                "message": "No files found on disk",
            }
            continue

        try:
            actual_cols = engine.sample_columns(pattern)
        except Exception as e:
            results[name] = {
                "status": "error",
                "message": f"Failed to read columns: {e}",
            }
            continue

        expected = set(config.get("columns", []))
        required = set(config.get("required_columns", []))
        actual = set(actual_cols)

        missing_required = required - actual
        missing_declared = expected - actual if expected else set()
        extra = actual - expected if expected else set()

        if missing_required:
            status = "invalid"
        elif missing_declared:
            status = "warning"
        else:
            status = "valid"

        results[name] = {
            "status": status,
            "actual_columns": sorted(actual),
            "expected_columns": sorted(expected) if expected else None,
            "required_columns": sorted(required),
            "missing_required": sorted(missing_required) if missing_required else [],
            "missing_declared": sorted(missing_declared) if missing_declared else [],
            "extra_columns": sorted(extra) if extra else [],
        }

    return results
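As a sketch of the contract between the registry and the helpers above (illustrative only; "depmap" is an invented name, not part of the shipped registry), a new dataset is declared with the same keys the existing entries use. discover_datasets() reports it once matching files exist under DATA_ROOT, and /health/schema marks it invalid if a required column is missing.

    from ct.api.config import DATASET_REGISTRY, discover_datasets

    # Hypothetical entry, using the same keys as the shipped datasets.
    DATASET_REGISTRY["depmap"] = {
        "description": "Example: per-gene dependency scores",
        "path_pattern": "depmap/*.parquet",   # globbed relative to DATA_ROOT
        "format": "parquet",
        "columns": [],                        # empty list: skip declared-column checks
        "required_columns": ["gene"],         # schema validation fails without these
        "filterable": ["gene"],               # columns /query accepts as filters
    }

    available = discover_datasets()
    if "depmap" in available:
        print(available["depmap"]["n_files"], "files,",
              available["depmap"]["total_size_mb"], "MB")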
ct/api/engine.py
ADDED
@@ -0,0 +1,124 @@
"""
DuckDB query engine for the ct Data API.

Queries Parquet and CSV.gz files directly without loading into memory.
Supports predicate pushdown for efficient filtering of large datasets.
"""

from pathlib import Path

import duckdb

from ct.api.config import DATA_ROOT, MAX_QUERY_ROWS, DEFAULT_QUERY_LIMIT


class QueryEngine:
    """DuckDB-based query engine for tabular data files."""

    def __init__(self, data_root: Path | None = None):
        self.data_root = data_root or DATA_ROOT
        self.conn = duckdb.connect(":memory:")

    def _source(self, file_pattern: str) -> str:
        """Build the DuckDB read-function call for a file pattern."""
        full_pattern = str(self.data_root / file_pattern)
        if ".csv.gz" in file_pattern or ".csv" in file_pattern:
            return f"read_csv_auto('{full_pattern}', union_by_name=true, ignore_errors=true)"
        # Default to Parquet for .parquet and unknown extensions
        return f"read_parquet('{full_pattern}', union_by_name=true)"

    @staticmethod
    def _where(filters: dict | None) -> tuple[str, list]:
        """Build a parameterized WHERE clause from a filter dict."""
        where_parts = []
        params: list = []
        if filters:
            for col, val in filters.items():
                if isinstance(val, list):
                    placeholders = ", ".join(["?" for _ in val])
                    where_parts.append(f'"{col}" IN ({placeholders})')
                    params.extend(val)
                elif isinstance(val, str) and "%" in val:
                    where_parts.append(f'"{col}" LIKE ?')
                    params.append(val)
                else:
                    where_parts.append(f'"{col}" = ?')
                    params.append(val)
        clause = f" WHERE {' AND '.join(where_parts)}" if where_parts else ""
        return clause, params

    def query_parquet(self, file_pattern: str, filters: dict | None = None,
                      columns: list | None = None, limit: int | None = None,
                      order_by: str | None = None) -> list[dict]:
        """Query Parquet or CSV.gz files with optional filtering.

        Args:
            file_pattern: Glob pattern relative to data_root (e.g. "chembl/36/*.parquet")
            filters: Dict of column_name → value for WHERE clauses
            columns: List of columns to SELECT (default: all)
            limit: Max rows to return
            order_by: Column to ORDER BY

        Returns:
            List of dicts, one per row.
        """
        limit = min(limit or DEFAULT_QUERY_LIMIT, MAX_QUERY_ROWS)

        # Build SELECT clause
        select_cols = ", ".join(columns) if columns else "*"

        source = self._source(file_pattern)
        where_clause, params = self._where(filters)
        order_clause = f' ORDER BY "{order_by}"' if order_by else ""
        limit_clause = f" LIMIT {limit}"

        query = f"SELECT {select_cols} FROM {source}{where_clause}{order_clause}{limit_clause}"

        try:
            result = self.conn.execute(query, params).fetchdf()
            return result.to_dict("records")
        except duckdb.IOException as e:
            raise FileNotFoundError(f"Data files not found: {self.data_root / file_pattern}") from e
        except (duckdb.CatalogException, duckdb.BinderException) as e:
            raise ValueError(f"Query error (bad column?): {e}") from e

    def sample_columns(self, file_pattern: str) -> list[str]:
        """Read column names from first matching file without loading data."""
        query = f"SELECT * FROM {self._source(file_pattern)} LIMIT 0"
        result = self.conn.execute(query)
        return [desc[0] for desc in result.description]

    def count(self, file_pattern: str, filters: dict | None = None) -> int:
        """Count rows matching filters."""
        where_clause, params = self._where(filters)
        query = f"SELECT COUNT(*) as cnt FROM {self._source(file_pattern)}{where_clause}"

        try:
            result = self.conn.execute(query, params).fetchone()
            return result[0] if result else 0
        except duckdb.IOException:
            return 0