scorepilot 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorepilot/__init__.py +3 -0
- scorepilot/api/__init__.py +1 -0
- scorepilot/api/datasets.py +116 -0
- scorepilot/api/deps.py +34 -0
- scorepilot/api/exploration.py +195 -0
- scorepilot/api/models.py +243 -0
- scorepilot/app.py +80 -0
- scorepilot/config.py +52 -0
- scorepilot/core/__init__.py +59 -0
- scorepilot/core/_pandas.py +27 -0
- scorepilot/core/modeling.py +135 -0
- scorepilot/core/pca.py +143 -0
- scorepilot/core/preprocessing.py +77 -0
- scorepilot/core/profiling.py +158 -0
- scorepilot/core/quality.py +187 -0
- scorepilot/core/schema.py +56 -0
- scorepilot/core/transforms.py +80 -0
- scorepilot/core/workset.py +181 -0
- scorepilot/dataset_store.py +137 -0
- scorepilot/db/__init__.py +19 -0
- scorepilot/db/models.py +58 -0
- scorepilot/db/repository.py +86 -0
- scorepilot/db/session.py +41 -0
- scorepilot/main.py +53 -0
- scorepilot/schemas.py +277 -0
- scorepilot/web/_app/immutable/assets/2.BqmOpLO_.css +1 -0
- scorepilot/web/_app/immutable/assets/3.4vnlz7rQ.css +1 -0
- scorepilot/web/_app/immutable/assets/4.ZtP4eHL5.css +1 -0
- scorepilot/web/_app/immutable/assets/5.Db_Au4kE.css +1 -0
- scorepilot/web/_app/immutable/assets/6.DzOtZxiW.css +1 -0
- scorepilot/web/_app/immutable/chunks/4R7wieXd.js +1 -0
- scorepilot/web/_app/immutable/chunks/BFv5X3rB.js +1 -0
- scorepilot/web/_app/immutable/chunks/BG11_7tn.js +1 -0
- scorepilot/web/_app/immutable/chunks/BvkNMq5N.js +2 -0
- scorepilot/web/_app/immutable/chunks/C652voOb.js +1 -0
- scorepilot/web/_app/immutable/chunks/CUWj1fbK.js +1 -0
- scorepilot/web/_app/immutable/chunks/Cxc2N8I5.js +1 -0
- scorepilot/web/_app/immutable/chunks/D29yIDqO.js +1 -0
- scorepilot/web/_app/immutable/chunks/DEBUNeTL.js +1 -0
- scorepilot/web/_app/immutable/chunks/DTnWt5ot.js +1 -0
- scorepilot/web/_app/immutable/chunks/DX4pGedH.js +2 -0
- scorepilot/web/_app/immutable/chunks/DagW9dn_.js +1 -0
- scorepilot/web/_app/immutable/chunks/DiAmW3g9.js +1 -0
- scorepilot/web/_app/immutable/chunks/OWCG57QL.js +1 -0
- scorepilot/web/_app/immutable/chunks/XL8PmYae.js +1 -0
- scorepilot/web/_app/immutable/chunks/i5N5regl.js +60 -0
- scorepilot/web/_app/immutable/entry/app.ByUizNFm.js +2 -0
- scorepilot/web/_app/immutable/entry/start.P518WGvt.js +1 -0
- scorepilot/web/_app/immutable/nodes/0.B3UdGiPr.js +1 -0
- scorepilot/web/_app/immutable/nodes/1.Bm29Ig6r.js +1 -0
- scorepilot/web/_app/immutable/nodes/2.DXWavgtc.js +1 -0
- scorepilot/web/_app/immutable/nodes/3.BFvNjpq_.js +3 -0
- scorepilot/web/_app/immutable/nodes/4.BAIJftHt.js +1 -0
- scorepilot/web/_app/immutable/nodes/5.CZMyJzy8.js +2 -0
- scorepilot/web/_app/immutable/nodes/6.VxIOLuZA.js +1 -0
- scorepilot/web/_app/version.json +1 -0
- scorepilot/web/index.html +41 -0
- scorepilot-0.1.2.dist-info/METADATA +173 -0
- scorepilot-0.1.2.dist-info/RECORD +62 -0
- scorepilot-0.1.2.dist-info/WHEEL +4 -0
- scorepilot-0.1.2.dist-info/entry_points.txt +2 -0
- scorepilot-0.1.2.dist-info/licenses/LICENSE +21 -0
scorepilot/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""FastAPI routers: a thin layer translating HTTP to core/db."""
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Dataset import and metadata endpoints.
|
|
2
|
+
|
|
3
|
+
Uploaded datasets live in an in-memory store keyed by a generated id. Import keeps
|
|
4
|
+
every column (identifiers and qualitative columns included) and infers each
|
|
5
|
+
column's data type. Roles like X/Y and exclusions are *not* set here - those are
|
|
6
|
+
modelling choices captured in a preprocessing spec.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Annotated
|
|
12
|
+
|
|
13
|
+
from fastapi import APIRouter, File, HTTPException, Query, UploadFile, status
|
|
14
|
+
|
|
15
|
+
from scorepilot.api.deps import DatasetStoreDep
|
|
16
|
+
from scorepilot.core import IdentifierRole
|
|
17
|
+
from scorepilot.dataset_store import Dataset, load_table
|
|
18
|
+
from scorepilot.schemas import ColumnMetaModel, ColumnUpdate, DatasetDetail
|
|
19
|
+
|
|
20
|
+
# Router for the dataset endpoints; every route declared on it in this module
# is served under the "/datasets" prefix and tagged "datasets".
router = APIRouter(prefix="/datasets", tags=["datasets"])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def to_detail(dataset: Dataset) -> DatasetDetail:
    """Convert a stored dataset into its ``DatasetDetail`` response model.

    Dataset-level attributes are copied across unchanged, and every entry of
    ``dataset.columns`` is mapped to a ``ColumnMetaModel`` holding its name,
    column type, and identifier role.
    """
    # Scalar, dataset-level fields first (same read order as the response model call).
    fields = {
        "dataset_id": dataset.id,
        "name": dataset.name,
        "source": dataset.source,
        "sheet": dataset.sheet,
        "sheets": dataset.sheets,
        "n_rows": dataset.n_rows,
        "n_columns": dataset.n_columns,
        "primary_id": dataset.primary_id,
    }

    # One metadata model per column, in the dataset's own column order.
    column_models = []
    for meta in dataset.columns:
        column_models.append(
            ColumnMetaModel(
                name=meta.name,
                column_type=meta.column_type,
                identifier_role=meta.identifier_role,
            )
        )

    return DatasetDetail(columns=column_models, **fields)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@router.post("", response_model=DatasetDetail, status_code=status.HTTP_201_CREATED)
async def upload_dataset(
    store: DatasetStoreDep,
    file: Annotated[UploadFile, File(description="A CSV or Excel file")],
    sheet: Annotated[str | None, Query(description="Excel sheet name")] = None,
) -> DatasetDetail:
    """Import a CSV or Excel file as a dataset."""
    # The whole upload is read into memory before parsing.
    payload = await file.read()
    # Uploads that arrive without a filename are treated as "dataset.csv".
    filename = file.filename if file.filename else "dataset.csv"

    try:
        frame, source, sheets, used = load_table(payload, filename, sheet)
    except Exception as exc:
        # Request boundary: any parser failure is reported to the client as a
        # 400 that carries the parser's own message.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Could not parse file: {exc}",
        ) from exc

    # A table that parsed but has zero columns is rejected as well.
    if not frame.shape[1]:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No columns found in the uploaded file.",
        )

    stored = store.add(filename, frame, source=source, sheets=sheets, sheet=used)
    return to_detail(stored)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@router.get("", response_model=list[DatasetDetail])
def list_datasets(store: DatasetStoreDep) -> list[DatasetDetail]:
    """List all imported datasets."""
    # Every dataset currently held by the store, converted one by one.
    return list(map(to_detail, store.list()))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@router.get("/{dataset_id}", response_model=DatasetDetail)
def get_dataset(dataset_id: str, store: DatasetStoreDep) -> DatasetDetail:
    """Return one dataset's metadata."""
    # _require raises a 404 HTTPException when the id is not in the store.
    found = _require(store, dataset_id)
    return to_detail(found)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@router.patch("/{dataset_id}/columns/{column}", response_model=DatasetDetail)
def update_column(
    dataset_id: str, column: str, update: ColumnUpdate, store: DatasetStoreDep
) -> DatasetDetail:
    """Update a column's data type or identifier role."""
    dataset = _require(store, dataset_id)
    target = dataset.column(column)
    if target is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Unknown column: {column}",
        )

    # Fields left as None in the update are not touched.
    new_type = update.column_type
    if new_type is not None:
        target.column_type = new_type

    new_role = update.identifier_role
    if new_role is not None:
        if new_role is IdentifierRole.PRIMARY:
            # Only one column may hold the PRIMARY role: demote every current
            # holder before promoting the target.
            for existing in dataset.columns:
                if existing.identifier_role is IdentifierRole.PRIMARY:
                    existing.identifier_role = IdentifierRole.NONE
        target.identifier_role = new_role

    return to_detail(dataset)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _require(store: DatasetStoreDep, dataset_id: str) -> Dataset:
    """Return the dataset stored under ``dataset_id``.

    Raises
    ------
    HTTPException
        With status 404 when ``store.get`` returns ``None`` for the id.
    """
    found = store.get(dataset_id)
    if found is not None:
        return found
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"Unknown dataset_id: {dataset_id}",
    )
|
scorepilot/api/deps.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""FastAPI dependencies wiring requests to the dataset store and repository."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
from typing import Annotated
|
|
7
|
+
|
|
8
|
+
from fastapi import Depends, Request
|
|
9
|
+
from sqlalchemy.orm import Session
|
|
10
|
+
|
|
11
|
+
from scorepilot.dataset_store import DatasetStore
|
|
12
|
+
from scorepilot.db import SqlModelRepository, session_scope
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_dataset_store(request: Request) -> DatasetStore:
    """Fetch the dataset store kept on the application's ``state`` object."""
    app_state = request.app.state
    return app_state.dataset_store
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_session(request: Request) -> Iterator[Session]:
    """Yield a session opened from the application's session factory.

    The session lives inside ``session_scope``; whatever commit/rollback
    handling applies when the request finishes is implemented there.
    """
    factory = request.app.state.session_factory
    with session_scope(factory) as db_session:
        yield db_session
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_repository(
    session: Annotated[Session, Depends(get_session)],
) -> SqlModelRepository:
    """Wrap the request's session (from ``get_session``) in a repository."""
    repository = SqlModelRepository(session)
    return repository
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Annotated alias: a parameter typed with it is resolved through get_dataset_store.
DatasetStoreDep = Annotated[DatasetStore, Depends(get_dataset_store)]
|
|
34
|
+
# Annotated alias: a parameter typed with it is resolved through get_repository.
RepositoryDep = Annotated[SqlModelRepository, Depends(get_repository)]
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Exploration endpoints: data-quality, grid windows, and the variable inspector.
|
|
2
|
+
|
|
3
|
+
These are read-only views over an immutable dataset. The transform shown in the
|
|
4
|
+
inspector is a non-destructive *preview*; it never changes the stored data.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Annotated
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from fastapi import APIRouter, HTTPException, Query, status
|
|
13
|
+
|
|
14
|
+
from scorepilot.api.deps import DatasetStoreDep
|
|
15
|
+
from scorepilot.core import (
|
|
16
|
+
PreprocessingSpec,
|
|
17
|
+
TransformKind,
|
|
18
|
+
apply_spec,
|
|
19
|
+
apply_transform,
|
|
20
|
+
histogram,
|
|
21
|
+
quality_report,
|
|
22
|
+
sequence,
|
|
23
|
+
suggest_transform,
|
|
24
|
+
variable_summary,
|
|
25
|
+
)
|
|
26
|
+
from scorepilot.core._pandas import column as get_column
|
|
27
|
+
from scorepilot.dataset_store import Dataset
|
|
28
|
+
from scorepilot.schemas import (
|
|
29
|
+
ColumnQualityModel,
|
|
30
|
+
DuplicateIdentifierModel,
|
|
31
|
+
GridWindow,
|
|
32
|
+
ObservationQualityModel,
|
|
33
|
+
QualityReportModel,
|
|
34
|
+
VariableInspector,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# Router for the exploration endpoints; it uses the "/datasets" prefix and
# tags its routes "exploration".
router = APIRouter(prefix="/datasets", tags=["exploration"])
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _require(store: DatasetStoreDep, dataset_id: str) -> Dataset:
    """Look up ``dataset_id`` in the store, failing with a 404 when absent.

    Raises
    ------
    HTTPException
        With status 404 when ``store.get`` returns ``None`` for the id.
    """
    found = store.get(dataset_id)
    if found is not None:
        return found
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"Unknown dataset_id: {dataset_id}",
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _format_cell(value: object) -> str | None:
    """Render one grid cell as text.

    Missing values (anything ``pd.isna`` reports as missing) become ``None``,
    floats are formatted with six significant digits, and every other value
    is passed through ``str``.
    """
    if pd.isna(value):  # type: ignore[arg-type]
        return None
    return f"{value:.6g}" if isinstance(value, float) else str(value)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@router.get("/{dataset_id}/quality", response_model=QualityReportModel)
def get_quality(dataset_id: str, store: DatasetStoreDep) -> QualityReportModel:
    """Return the data-quality report for a dataset."""
    dataset = _require(store, dataset_id)
    report = quality_report(dataset.raw, types=dataset.types(), primary_id=dataset.primary_id)

    # Translate each nested part of the core report into its response model.
    duplicates = [
        DuplicateIdentifierModel(value=dup.value, rows=dup.rows)
        for dup in report.duplicate_primary_ids
    ]
    column_models = [
        ColumnQualityModel(
            name=col.name,
            n_missing=col.n_missing,
            pct_missing=col.pct_missing,
            n_invalid=col.n_invalid,
            invalid_rows=col.invalid_rows,
            exceeds_tolerance=col.exceeds_tolerance,
        )
        for col in report.columns
    ]
    observation_models = [
        ObservationQualityModel(
            index=obs.index,
            identifier=obs.identifier,
            n_missing=obs.n_missing,
            pct_missing=obs.pct_missing,
        )
        for obs in report.observations_exceeding
    ]

    return QualityReportModel(
        n_rows=report.n_rows,
        n_columns=report.n_columns,
        n_missing_cells=report.n_missing_cells,
        pct_missing=report.pct_missing,
        primary_id_unique=report.primary_id_unique,
        duplicate_primary_ids=duplicates,
        columns=column_models,
        observations_exceeding=observation_models,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@router.get("/{dataset_id}/grid", response_model=GridWindow)
def get_grid(
    dataset_id: str,
    store: DatasetStoreDep,
    row_offset: Annotated[int, Query(ge=0)] = 0,
    row_limit: Annotated[int, Query(ge=1, le=1000)] = 100,
    col_offset: Annotated[int, Query(ge=0)] = 0,
    col_limit: Annotated[int, Query(ge=1, le=200)] = 50,
    form: Annotated[str, Query(pattern="^(raw|scaled)$")] = "raw",
) -> GridWindow:
    """Return a windowed block of cells for the grid, raw or autoscaled."""
    dataset = _require(store, dataset_id)
    display = _display_frame(dataset, form)

    # Cut the window by position on BOTH axes. Selecting columns by their
    # stringified labels would raise KeyError for non-string labels and would
    # return extra columns when labels are duplicated, leaving the cell rows
    # wider than ``column_names``. Positional slicing keeps them aligned, and
    # offsets past either edge simply give an empty window.
    window = display.iloc[
        row_offset : row_offset + row_limit,
        col_offset : col_offset + col_limit,
    ]
    column_names = [str(label) for label in window.columns]

    row_identifiers = _row_identifiers(dataset, row_offset, len(window))
    # Cells are emitted row-major as display strings (None marks a missing value).
    cells = [
        [_format_cell(window.iat[r, c]) for c in range(window.shape[1])]
        for r in range(window.shape[0])
    ]
    return GridWindow(
        row_offset=row_offset,
        column_names=column_names,
        row_identifiers=row_identifiers,
        cells=cells,
    )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _display_frame(dataset: Dataset, form: str) -> pd.DataFrame:
    """Pick the frame the grid should show for the requested ``form``.

    Anything other than ``"scaled"`` returns ``dataset.raw`` itself, as does
    ``"scaled"`` when the dataset reports no quantitative columns. Otherwise a
    copy of the raw frame is returned in which each quantitative column is
    replaced by the matching column of ``apply_spec(...).X``.
    """
    quantitative = dataset.quantitative_columns() if form == "scaled" else None
    if not quantitative:
        return dataset.raw

    # Scale using full-column statistics so the view is stable across scrolling.
    scaled = apply_spec(dataset.raw, PreprocessingSpec(x_columns=tuple(quantitative))).X
    shown = dataset.raw.copy()
    for column_name in quantitative:
        shown[column_name] = scaled[column_name]
    return shown
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _row_identifiers(dataset: Dataset, row_offset: int, count: int) -> list[str | None]:
    """Return one label per row in the window starting at ``row_offset``.

    Without a primary id column the labels are the row positions as strings.
    With one, they are that column's values for the window, each rendered by
    ``_format_cell`` (so a missing identifier is ``None``).
    """
    if dataset.primary_id is None:
        return [str(position) for position in range(row_offset, row_offset + count)]
    id_values = get_column(dataset.raw, dataset.primary_id).iloc[row_offset : row_offset + count]
    return list(map(_format_cell, id_values))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@router.get("/{dataset_id}/variables/{column}", response_model=VariableInspector)
def inspect_variable(
    dataset_id: str,
    column: str,
    store: DatasetStoreDep,
    transform: Annotated[TransformKind, Query()] = TransformKind.NONE,
    c1: Annotated[float, Query()] = 0.0,
    c2: Annotated[float, Query()] = 1.0,
) -> VariableInspector:
    """Return summary, histogram, and sequence for a variable (with optional preview)."""
    dataset = _require(store, dataset_id)
    meta = dataset.column(column)
    if meta is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Unknown column: {column}",
        )

    raw_values = get_column(dataset.raw, column)
    # The suggestion is always derived from the untransformed values.
    raw_summary = variable_summary(raw_values, column_type=meta.column_type)
    suggested = suggest_transform(raw_summary)

    # Statistics, histogram and sequence describe the previewed values; the
    # transformed result is only a local and is never written back.
    if transform is TransformKind.NONE:
        shown = raw_values
    else:
        shown = apply_transform(raw_values, transform, c1=c1, c2=c2)
    stats = variable_summary(shown, column_type=meta.column_type)
    counts, edges = histogram(shown)

    return VariableInspector(
        name=meta.name,
        column_type=meta.column_type,
        n=stats.n,
        n_missing=stats.n_missing,
        pct_missing=stats.pct_missing,
        n_unique=stats.n_unique,
        mean=stats.mean,
        std=stats.std,
        minimum=stats.minimum,
        maximum=stats.maximum,
        median=stats.median,
        q25=stats.q25,
        q75=stats.q75,
        skewness=stats.skewness,
        min_max_ratio=stats.min_max_ratio,
        suggested_transform=suggested,
        histogram_counts=counts,
        histogram_edges=edges,
        sequence=sequence(shown),
    )
|
scorepilot/api/models.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""Model-fitting endpoints, the Hangar (list), and the Logbook (detail)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from fastapi import APIRouter, HTTPException, status
|
|
9
|
+
|
|
10
|
+
from scorepilot.api.deps import DatasetStoreDep, RepositoryDep
|
|
11
|
+
from scorepilot.core import (
|
|
12
|
+
ModelDiagnostics,
|
|
13
|
+
PreprocessingSpec,
|
|
14
|
+
apply_spec,
|
|
15
|
+
fit_model,
|
|
16
|
+
)
|
|
17
|
+
from scorepilot.dataset_store import Dataset
|
|
18
|
+
from scorepilot.db import Model
|
|
19
|
+
from scorepilot.schemas import (
|
|
20
|
+
FitModelRequest,
|
|
21
|
+
FitPCARequest,
|
|
22
|
+
LoadingsPayload,
|
|
23
|
+
ModelDetail,
|
|
24
|
+
ModelDiagnosticsModel,
|
|
25
|
+
ModelSummary,
|
|
26
|
+
PCAFitResponse,
|
|
27
|
+
ScoresPayload,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Router for the model endpoints; every route declared on it in this module
# is served under the "/models" prefix and tagged "models".
router = APIRouter(prefix="/models", tags=["models"])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# --- helpers ----------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _pack_params(diag: ModelDiagnostics) -> bytes:
    """Serialise selected fitted arrays into compressed ``.npz`` bytes.

    The archive holds three entries: ``x_loadings``, ``explained_variance``
    and ``r2_cumulative``.
    """
    arrays = {
        "x_loadings": diag.x_loadings.to_numpy(),
        "explained_variance": diag.explained_variance,
        "r2_cumulative": np.asarray(diag.r2_cumulative),
    }
    with io.BytesIO() as sink:
        np.savez_compressed(sink, **arrays)
        return sink.getvalue()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _default_spec(dataset: Dataset, spec: PreprocessingSpec | None) -> PreprocessingSpec:
    """Return ``spec`` unchanged, or a default one when it is ``None``.

    The default uses every column reported by
    ``dataset.quantitative_columns()`` as an X column.
    """
    if spec is None:
        return PreprocessingSpec(x_columns=tuple(dataset.quantitative_columns()))
    return spec
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _run_fit(
    dataset: Dataset, spec: PreprocessingSpec, kind: str, n_components: int, conf_level: float
) -> ModelDiagnostics:
    """Apply ``spec`` to the dataset's raw frame and fit a model on the result.

    The ``X`` and ``Y`` outputs of ``apply_spec`` are passed to ``fit_model``
    together with the model kind, component count and confidence level.
    """
    prepared = apply_spec(dataset.raw, spec)
    diagnostics = fit_model(prepared.X, prepared.Y, kind, n_components, conf_level=conf_level)
    return diagnostics
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _summary(model: Model) -> ModelSummary:
    """Build the ``ModelSummary`` for a stored model.

    Each summary field is read from the attribute of the same name on the
    stored model.
    """
    summary_fields = (
        "id",
        "kind",
        "name",
        "n_components",
        "parent_id",
        "dataset_id",
        "created_at",
    )
    return ModelSummary(**{field: getattr(model, field) for field in summary_fields})
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _diagnostics_model(diag: ModelDiagnostics) -> ModelDiagnosticsModel:
    """Convert core fit diagnostics into the API response model.

    Frame-like members are flattened with ``to_numpy().tolist()``. The Y
    loadings payload is only built when ``diag.y_loadings`` is present;
    otherwise the response carries ``None`` for it.
    """
    if diag.y_loadings is None:
        y_payload = None
    else:
        y_payload = LoadingsPayload(
            component_names=diag.component_names,
            variable_names=diag.y_variable_names,
            data=diag.y_loadings.to_numpy().tolist(),
        )

    score_payload = ScoresPayload(
        component_names=diag.component_names,
        observation_names=diag.observation_names,
        data=diag.scores.to_numpy().tolist(),
    )
    x_payload = LoadingsPayload(
        component_names=diag.component_names,
        variable_names=diag.x_variable_names,
        data=diag.x_loadings.to_numpy().tolist(),
    )

    return ModelDiagnosticsModel(
        kind=diag.kind,
        n_components=diag.n_components,
        conf_level=diag.conf_level,
        component_names=diag.component_names,
        explained_variance=diag.explained_variance.tolist(),
        r2_per_component=diag.r2_per_component,
        r2_cumulative=diag.r2_cumulative,
        scores=score_payload,
        x_loadings=x_payload,
        y_loadings=y_payload,
        hotellings_t2=diag.hotellings_t2.to_numpy().tolist(),
        spe=diag.spe.to_numpy().tolist(),
        t2_limit=diag.t2_limit,
        spe_limit=diag.spe_limit,
        ellipse_x=diag.ellipse_x,
        ellipse_y=diag.ellipse_y,
        vip=diag.vip,
    )
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _require_dataset(store: DatasetStoreDep, dataset_id: str) -> Dataset:
    """Return the dataset stored under ``dataset_id``.

    Raises
    ------
    HTTPException
        With status 404 when ``store.get`` returns ``None`` for the id.
    """
    found = store.get(dataset_id)
    if found is not None:
        return found
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"Unknown dataset_id: {dataset_id}",
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# --- endpoints --------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@router.post("", response_model=ModelDetail, status_code=status.HTTP_201_CREATED)
def fit_model_endpoint(
    request: FitModelRequest, store: DatasetStoreDep, repository: RepositoryDep
) -> ModelDetail:
    """Fit a PCA/PLS model variant from a dataset and a spec, and persist it."""
    dataset = _require_dataset(store, request.dataset_id)

    # A declared parent must already exist in the repository.
    parent_missing = request.parent_id is not None and repository.get(request.parent_id) is None
    if parent_missing:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Unknown parent_id: {request.parent_id}",
        )

    # Without a request spec, _default_spec builds one from the dataset.
    requested_spec = request.spec.to_core() if request.spec else None
    spec = _default_spec(dataset, requested_spec)
    try:
        diag = _run_fit(dataset, spec, request.kind, request.n_components, request.conf_level)
    except ValueError as exc:
        # A ValueError from the fit is surfaced to the client as a 422.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_CONTENT, detail=str(exc)
        ) from exc

    record = Model(
        kind=request.kind,
        name=request.name,
        dataset_id=dataset.id,
        n_components=diag.n_components,
        preprocessing=spec.to_dict(),
        excluded_samples=list(spec.excluded_rows),
        params=_pack_params(diag),
        parent_id=request.parent_id,
    )
    model = repository.add(record)

    return ModelDetail(
        summary=_summary(model),
        preprocessing=spec.to_dict(),
        excluded_samples=list(spec.excluded_rows),
        lineage=[_summary(ancestor) for ancestor in repository.lineage(model.id)],
        diagnostics=_diagnostics_model(diag),
    )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@router.get("", response_model=list[ModelSummary])
def list_models(repository: RepositoryDep) -> list[ModelSummary]:
    """List all model variants (the Hangar)."""
    # One summary per model returned by the repository, in repository order.
    return list(map(_summary, repository.list()))
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@router.get("/{model_id}", response_model=ModelDetail)
def get_model(model_id: int, store: DatasetStoreDep, repository: RepositoryDep) -> ModelDetail:
    """Return a model's Logbook: metadata, recipe, lineage, and diagnostics.

    Diagnostics are recomputed from the source dataset and stored spec. If the
    dataset is no longer in memory, the entry is returned without diagnostics.
    """
    model = repository.get(model_id)
    if model is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail=f"Unknown model id: {model_id}"
        )

    # Models without a dataset id, or whose dataset is gone from the store,
    # skip the refit entirely.
    dataset = store.get(model.dataset_id) if model.dataset_id else None
    diagnostics = None
    if dataset is not None:
        spec = PreprocessingSpec.from_dict(model.preprocessing)
        try:
            # NOTE(review): the refit always uses a 0.95 confidence level, not
            # necessarily the level requested at fit time - confirm intended.
            refit = _run_fit(dataset, spec, model.kind, model.n_components, 0.95)
            diagnostics = _diagnostics_model(refit)
        except ValueError:
            # Best effort: a failed refit leaves the entry without diagnostics.
            diagnostics = None

    return ModelDetail(
        summary=_summary(model),
        preprocessing=dict(model.preprocessing),
        excluded_samples=list(model.excluded_samples),
        lineage=[_summary(ancestor) for ancestor in repository.lineage(model.id)],
        diagnostics=diagnostics,
    )
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@router.post("/pca", response_model=PCAFitResponse, status_code=status.HTTP_201_CREATED)
def fit_pca_model(
    request: FitPCARequest, store: DatasetStoreDep, repository: RepositoryDep
) -> PCAFitResponse:
    """Fit a PCA model (legacy endpoint used by the scores playground)."""
    dataset = _require_dataset(store, request.dataset_id)

    # Without a request spec, _default_spec builds one from the dataset.
    requested_spec = request.spec.to_core() if request.spec else None
    spec = _default_spec(dataset, requested_spec)
    try:
        diag = _run_fit(dataset, spec, "PCA", request.n_components, request.conf_level)
    except ValueError as exc:
        # A ValueError from the fit is surfaced to the client as a 422.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_CONTENT, detail=str(exc)
        ) from exc

    # Persist the fit; this endpoint never records a parent model.
    record = Model(
        kind="PCA",
        name=request.name,
        dataset_id=dataset.id,
        n_components=diag.n_components,
        preprocessing=spec.to_dict(),
        excluded_samples=list(spec.excluded_rows),
        params=_pack_params(diag),
    )
    model = repository.add(record)

    score_payload = ScoresPayload(
        component_names=diag.component_names,
        observation_names=diag.observation_names,
        data=diag.scores.to_numpy().tolist(),
    )
    return PCAFitResponse(
        model_id=model.id,
        kind="PCA",
        n_components=diag.n_components,
        conf_level=diag.conf_level,
        component_names=diag.component_names,
        explained_variance=diag.explained_variance.tolist(),
        r2_cumulative=diag.r2_cumulative,
        scores=score_payload,
        hotellings_t2=diag.hotellings_t2.to_numpy().tolist(),
        spe=diag.spe.to_numpy().tolist(),
        t2_limit=diag.t2_limit,
        spe_limit=diag.spe_limit,
    )
|
scorepilot/app.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""FastAPI application factory.
|
|
2
|
+
|
|
3
|
+
The packaged app is a single process: the API is served under ``/api`` and the
|
|
4
|
+
built static frontend at ``/``. A small ``StaticFiles`` subclass falls back to
|
|
5
|
+
``index.html`` so client-side (SPA) routes resolve.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from fastapi import FastAPI
|
|
13
|
+
from fastapi.staticfiles import StaticFiles
|
|
14
|
+
from starlette.exceptions import HTTPException as StarletteHTTPException
|
|
15
|
+
from starlette.types import Scope
|
|
16
|
+
|
|
17
|
+
from scorepilot.api import datasets, exploration, models
|
|
18
|
+
from scorepilot.config import Settings, get_settings
|
|
19
|
+
from scorepilot.dataset_store import DatasetStore
|
|
20
|
+
from scorepilot.db import Base, make_engine, make_session_factory
|
|
21
|
+
|
|
22
|
+
# Directory named "web" next to this module; create_app mounts it as the
# static file root at "/".
WEB_DIR = Path(__file__).resolve().parent / "web"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SpaStaticFiles(StaticFiles):
    """Static files that answer a 404 with ``index.html`` instead.

    Any path the base class cannot resolve (it raises a 404) is retried as
    ``index.html``, so the single-page app can own client-side routing. The
    fallback applies to every missing path, asset-looking ones included; a
    404 only reaches the client when the ``index.html`` lookup itself fails.
    Errors with any other status code propagate unchanged.
    """

    async def get_response(self, path: str, scope: Scope):  # noqa: ANN201
        try:
            response = await super().get_response(path, scope)
        except StarletteHTTPException as exc:
            if exc.status_code != 404:
                raise
            # Unknown path: hand the SPA shell to the client-side router.
            response = await super().get_response("index.html", scope)
        return response
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def create_app(settings: Settings | None = None) -> FastAPI:
    """Assemble the FastAPI application.

    Creates the database engine and ensures its tables exist, stores the
    shared objects (settings, engine, session factory, dataset store) on
    ``app.state``, registers the health check and the API routers under
    ``/api``, and finally mounts the static frontend at ``/``.

    Parameters
    ----------
    settings
        Optional settings override (used by tests). When omitted, the result
        of ``get_settings()`` is used.
    """
    if not settings:
        settings = get_settings()

    engine = make_engine(settings.database_url)
    # Convenience for local/dev use: ensure tables exist. Production schema
    # changes are managed with Alembic.
    Base.metadata.create_all(engine)

    # NOTE(review): the version is hard-coded to "0.1.0" while the wheel this
    # module ships in is 0.1.2 - confirm whether the two should track each other.
    app = FastAPI(
        title="ScorePilot",
        version="0.1.0",
        docs_url="/api/docs",
        redoc_url=None,
        openapi_url="/api/openapi.json",
    )

    # Shared objects on app.state (the dataset store starts out empty).
    state = app.state
    state.settings = settings
    state.engine = engine
    state.session_factory = make_session_factory(engine)
    state.dataset_store = DatasetStore()

    @app.get("/api/health", tags=["meta"])
    def health() -> dict[str, str]:
        return {"status": "ok"}

    for api_module in (datasets, exploration, models):
        app.include_router(api_module.router, prefix="/api")

    # Mount the static SPA last so it never shadows the API routes above.
    app.mount("/", SpaStaticFiles(directory=WEB_DIR, html=True), name="web")

    return app
|