arize-phoenix 4.14.1__py3-none-any.whl → 4.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic.
- {arize_phoenix-4.14.1.dist-info → arize_phoenix-4.16.0.dist-info}/METADATA +5 -3
- {arize_phoenix-4.14.1.dist-info → arize_phoenix-4.16.0.dist-info}/RECORD +81 -71
- phoenix/db/bulk_inserter.py +131 -5
- phoenix/db/engines.py +2 -1
- phoenix/db/helpers.py +23 -1
- phoenix/db/insertion/constants.py +2 -0
- phoenix/db/insertion/document_annotation.py +157 -0
- phoenix/db/insertion/helpers.py +13 -0
- phoenix/db/insertion/span_annotation.py +144 -0
- phoenix/db/insertion/trace_annotation.py +144 -0
- phoenix/db/insertion/types.py +261 -0
- phoenix/experiments/functions.py +3 -2
- phoenix/experiments/types.py +3 -3
- phoenix/server/api/context.py +7 -9
- phoenix/server/api/dataloaders/__init__.py +2 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +3 -3
- phoenix/server/api/dataloaders/dataset_example_revisions.py +2 -4
- phoenix/server/api/dataloaders/dataset_example_spans.py +2 -4
- phoenix/server/api/dataloaders/document_evaluation_summaries.py +2 -4
- phoenix/server/api/dataloaders/document_evaluations.py +2 -4
- phoenix/server/api/dataloaders/document_retrieval_metrics.py +2 -4
- phoenix/server/api/dataloaders/evaluation_summaries.py +2 -4
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +2 -4
- phoenix/server/api/dataloaders/experiment_error_rates.py +2 -4
- phoenix/server/api/dataloaders/experiment_run_counts.py +2 -4
- phoenix/server/api/dataloaders/experiment_sequence_number.py +2 -4
- phoenix/server/api/dataloaders/latency_ms_quantile.py +2 -3
- phoenix/server/api/dataloaders/min_start_or_max_end_times.py +2 -4
- phoenix/server/api/dataloaders/project_by_name.py +3 -3
- phoenix/server/api/dataloaders/record_counts.py +2 -4
- phoenix/server/api/dataloaders/span_annotations.py +2 -4
- phoenix/server/api/dataloaders/span_dataset_examples.py +36 -0
- phoenix/server/api/dataloaders/span_descendants.py +2 -4
- phoenix/server/api/dataloaders/span_evaluations.py +2 -4
- phoenix/server/api/dataloaders/span_projects.py +3 -3
- phoenix/server/api/dataloaders/token_counts.py +2 -4
- phoenix/server/api/dataloaders/trace_evaluations.py +2 -4
- phoenix/server/api/dataloaders/trace_row_ids.py +2 -4
- phoenix/server/api/input_types/SpanAnnotationSort.py +17 -0
- phoenix/server/api/input_types/TraceAnnotationSort.py +17 -0
- phoenix/server/api/mutations/span_annotations_mutations.py +8 -3
- phoenix/server/api/mutations/trace_annotations_mutations.py +8 -3
- phoenix/server/api/openapi/main.py +18 -2
- phoenix/server/api/openapi/schema.py +12 -12
- phoenix/server/api/routers/v1/__init__.py +36 -83
- phoenix/server/api/routers/v1/datasets.py +515 -509
- phoenix/server/api/routers/v1/evaluations.py +164 -73
- phoenix/server/api/routers/v1/experiment_evaluations.py +68 -91
- phoenix/server/api/routers/v1/experiment_runs.py +98 -155
- phoenix/server/api/routers/v1/experiments.py +132 -181
- phoenix/server/api/routers/v1/pydantic_compat.py +78 -0
- phoenix/server/api/routers/v1/spans.py +164 -203
- phoenix/server/api/routers/v1/traces.py +134 -159
- phoenix/server/api/routers/v1/utils.py +95 -0
- phoenix/server/api/types/Span.py +27 -3
- phoenix/server/api/types/Trace.py +21 -4
- phoenix/server/api/utils.py +4 -4
- phoenix/server/app.py +172 -192
- phoenix/server/grpc_server.py +2 -2
- phoenix/server/main.py +5 -9
- phoenix/server/static/.vite/manifest.json +31 -31
- phoenix/server/static/assets/components-Ci5kMOk5.js +1175 -0
- phoenix/server/static/assets/{index-CQgXRwU0.js → index-BQG5WVX7.js} +2 -2
- phoenix/server/static/assets/{pages-hdjlFZhO.js → pages-BrevprVW.js} +451 -275
- phoenix/server/static/assets/{vendor-DPvSDRn3.js → vendor-CP0b0YG0.js} +2 -2
- phoenix/server/static/assets/{vendor-arizeai-CkvPT67c.js → vendor-arizeai-DTbiPGp6.js} +27 -27
- phoenix/server/static/assets/vendor-codemirror-DtdPDzrv.js +15 -0
- phoenix/server/static/assets/{vendor-recharts-5jlNaZuF.js → vendor-recharts-A0DA1O99.js} +1 -1
- phoenix/server/thread_server.py +2 -2
- phoenix/server/types.py +18 -0
- phoenix/session/client.py +5 -3
- phoenix/session/session.py +2 -2
- phoenix/trace/dsl/filter.py +2 -6
- phoenix/trace/fixtures.py +17 -23
- phoenix/trace/utils.py +23 -0
- phoenix/utilities/client.py +116 -0
- phoenix/utilities/project.py +1 -1
- phoenix/version.py +1 -1
- phoenix/server/api/routers/v1/dataset_examples.py +0 -178
- phoenix/server/openapi/docs.py +0 -221
- phoenix/server/static/assets/components-DeS0YEmv.js +0 -1142
- phoenix/server/static/assets/vendor-codemirror-Cqwpwlua.js +0 -12
- {arize_phoenix-4.14.1.dist-info → arize_phoenix-4.16.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.14.1.dist-info → arize_phoenix-4.16.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.14.1.dist-info → arize_phoenix-4.16.0.dist-info}/licenses/LICENSE +0 -0
phoenix/server/api/routers/v1/datasets.py

@@ -6,6 +6,7 @@ import logging
 import zlib
 from asyncio import QueueFull
 from collections import Counter
+from datetime import datetime
 from enum import Enum
 from functools import partial
 from typing import (
@@ -13,6 +14,7 @@ from typing import (
     Awaitable,
     Callable,
     Coroutine,
+    Dict,
     FrozenSet,
     Iterator,
     List,
@@ -26,14 +28,16 @@ from typing import (
 
 import pandas as pd
 import pyarrow as pa
+from fastapi import APIRouter, BackgroundTasks, HTTPException, Path, Query
+from fastapi.responses import PlainTextResponse, StreamingResponse
 from sqlalchemy import and_, delete, func, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from starlette.background import BackgroundTasks
 from starlette.concurrency import run_in_threadpool
 from starlette.datastructures import FormData, UploadFile
 from starlette.requests import Request
-from starlette.responses import …
+from starlette.responses import Response
 from starlette.status import (
+    HTTP_200_OK,
     HTTP_204_NO_CONTENT,
     HTTP_404_NOT_FOUND,
     HTTP_409_CONFLICT,
@@ -51,79 +55,59 @@ from phoenix.db.insertion.dataset import (
     ExampleContent,
     add_dataset_examples,
 )
-from phoenix.server.api.types.Dataset import Dataset
-from phoenix.server.api.types.DatasetExample import DatasetExample
-from phoenix.server.api.types.DatasetVersion import DatasetVersion
+from phoenix.server.api.types.Dataset import Dataset as DatasetNodeType
+from phoenix.server.api.types.DatasetExample import DatasetExample as DatasetExampleNodeType
+from phoenix.server.api.types.DatasetVersion import DatasetVersion as DatasetVersionNodeType
 from phoenix.server.api.types.node import from_global_id_with_expected_type
 from phoenix.server.api.utils import delete_projects, delete_traces
 
+from .pydantic_compat import V1RoutesBaseModel
+from .utils import (
+    PaginatedResponseBody,
+    ResponseBody,
+    add_errors_to_responses,
+    add_text_csv_content_to_responses,
+)
+
 logger = logging.getLogger(__name__)
 
- …
-            description
- …
-            type: string
-            data:
-            type: array
-            items:
-            type: object
-            properties:
-            id:
-            type: string
-            name:
-            type: string
-            description:
-            type: string
-            metadata:
-            type: object
-            created_at:
-            type: string
-            format: date-time
-            updated_at:
-            type: string
-            format: date-time
-            403:
-            description: Forbidden
-            404:
-            description: No datasets found
-    """
-    name = request.query_params.get("name")
-    cursor = request.query_params.get("cursor")
-    limit = int(request.query_params.get("limit", 10))
+DATASET_NODE_NAME = DatasetNodeType.__name__
+DATASET_VERSION_NODE_NAME = DatasetVersionNodeType.__name__
+
+
+router = APIRouter(tags=["datasets"])
+
+
+class Dataset(V1RoutesBaseModel):
+    id: str
+    name: str
+    description: Optional[str]
+    metadata: Dict[str, Any]
+    created_at: datetime
+    updated_at: datetime
+
+
+class ListDatasetsResponseBody(PaginatedResponseBody[Dataset]):
+    pass
+
+
+@router.get(
+    "/datasets",
+    operation_id="listDatasets",
+    summary="List datasets",
+    responses=add_errors_to_responses([HTTP_422_UNPROCESSABLE_ENTITY]),
+)
+async def list_datasets(
+    request: Request,
+    cursor: Optional[str] = Query(
+        default=None,
+        description="Cursor for pagination",
+    ),
+    name: Optional[str] = Query(default=None, description="An optional dataset name to filter by"),
+    limit: int = Query(
+        default=10, description="The max number of datasets to return at a time.", gt=0
+    ),
+) -> ListDatasetsResponseBody:
     async with request.app.state.db() as session:
         query = select(models.Dataset).order_by(models.Dataset.id.desc())
 
@@ -132,79 +116,68 @@ async def list_datasets(request: Request) -> Response:
                 cursor_id = GlobalID.from_id(cursor).node_id
                 query = query.filter(models.Dataset.id <= int(cursor_id))
             except ValueError:
- …
+                raise HTTPException(
+                    detail=f"Invalid cursor format: {cursor}",
                     status_code=HTTP_422_UNPROCESSABLE_ENTITY,
                 )
         if name:
-            query = query.filter(models.Dataset.name …
+            query = query.filter(models.Dataset.name == name)
 
         query = query.limit(limit + 1)
         result = await session.execute(query)
         datasets = result.scalars().all()
 
         if not datasets:
-            return …
+            return ListDatasetsResponseBody(next_cursor=None, data=[])
 
         next_cursor = None
         if len(datasets) == limit + 1:
-            next_cursor = str(GlobalID(…
+            next_cursor = str(GlobalID(DATASET_NODE_NAME, str(datasets[-1].id)))
             datasets = datasets[:-1]
 
         data = []
         for dataset in datasets:
             data.append(
- …
+                Dataset(
+                    id=str(GlobalID(DATASET_NODE_NAME, str(dataset.id))),
+                    name=dataset.name,
+                    description=dataset.description,
+                    metadata=dataset.metadata_,
+                    created_at=dataset.created_at,
+                    updated_at=dataset.updated_at,
+                )
             )
 
-    return …
- …
-    ""
- …
-            description: Forbidden
-            404:
-            description: Dataset not found
-            422:
-            description: Dataset ID is invalid
-    """
-    if id_ := request.path_params.get("id"):
+    return ListDatasetsResponseBody(next_cursor=next_cursor, data=data)
+
+
+@router.delete(
+    "/datasets/{id}",
+    operation_id="deleteDatasetById",
+    summary="Delete dataset by ID",
+    status_code=HTTP_204_NO_CONTENT,
+    responses=add_errors_to_responses(
+        [
+            {"status_code": HTTP_404_NOT_FOUND, "description": "Dataset not found"},
+            {"status_code": HTTP_422_UNPROCESSABLE_ENTITY, "description": "Invalid dataset ID"},
+        ]
+    ),
+)
+async def delete_dataset(
+    request: Request, id: str = Path(description="The ID of the dataset to delete.")
+) -> None:
+    if id:
         try:
             dataset_id = from_global_id_with_expected_type(
-                GlobalID.from_id(
- …
+                GlobalID.from_id(id),
+                DATASET_NODE_NAME,
             )
         except ValueError:
- …
-                status_code=HTTP_422_UNPROCESSABLE_ENTITY,
+            raise HTTPException(
+                detail=f"Invalid Dataset ID: {id}", status_code=HTTP_422_UNPROCESSABLE_ENTITY
             )
     else:
- …
-            content="Missing Dataset ID",
-            status_code=HTTP_422_UNPROCESSABLE_ENTITY,
-        )
+        raise HTTPException(detail="Missing Dataset ID", status_code=HTTP_422_UNPROCESSABLE_ENTITY)
     project_names_stmt = get_project_names_for_datasets(dataset_id)
     eval_trace_ids_stmt = get_eval_trace_ids_for_datasets(dataset_id)
     stmt = (
@@ -214,59 +187,34 @@ async def delete_dataset_by_id(request: Request) -> Response:
         project_names = await session.scalars(project_names_stmt)
         eval_trace_ids = await session.scalars(eval_trace_ids_stmt)
         if (await session.scalar(stmt)) is None:
- …
+            raise HTTPException(detail="Dataset does not exist", status_code=HTTP_404_NOT_FOUND)
     tasks = BackgroundTasks()
     tasks.add_task(delete_projects, request.app.state.db, *project_names)
     tasks.add_task(delete_traces, request.app.state.db, *eval_trace_ids)
- …
-            type: string
-            name:
-            type: string
-            description:
-            type: string
-            metadata:
-            type: object
-            created_at:
-            type: string
-            format: date-time
-            updated_at:
-            type: string
-            format: date-time
-            example_count:
-            type: integer
-            403:
-            description: Forbidden
-            404:
-            description: Dataset not found
-    """
-    dataset_id = GlobalID.from_id(request.path_params["id"])
-
-    if (type_name := dataset_id.type_name) != NODE_NAME:
-        return Response(
-            content=f"ID {dataset_id} refers to a f{type_name}", status_code=HTTP_404_NOT_FOUND
+
+
+class DatasetWithExampleCount(Dataset):
+    example_count: int
+
+
+class GetDatasetResponseBody(ResponseBody[DatasetWithExampleCount]):
+    pass
+
+
+@router.get(
+    "/datasets/{id}",
+    operation_id="getDataset",
+    summary="Get dataset by ID",
+    responses=add_errors_to_responses([HTTP_404_NOT_FOUND]),
+)
+async def get_dataset(
+    request: Request, id: str = Path(description="The ID of the dataset")
+) -> GetDatasetResponseBody:
+    dataset_id = GlobalID.from_id(id)
+
+    if (type_name := dataset_id.type_name) != DATASET_NODE_NAME:
+        raise HTTPException(
+            detail=f"ID {dataset_id} refers to a f{type_name}", status_code=HTTP_404_NOT_FOUND
         )
     async with request.app.state.db() as session:
         result = await session.execute(
@@ -278,97 +226,64 @@ async def get_dataset_by_id(request: Request) -> Response:
         dataset = dataset_query[0] if dataset_query else None
         example_count = dataset_query[1] if dataset_query else 0
         if dataset is None:
- …
+            raise HTTPException(
+                detail=f"Dataset with ID {dataset_id} not found", status_code=HTTP_404_NOT_FOUND
             )
 
- …
-        return …
- …
-            schema:
-            type: object
-            properties:
-            next_cursor:
-            type: string
-            data:
-            type: array
-            items:
-            type: object
-            properties:
-            version_id:
-            type: string
-            description:
-            type: string
-            metadata:
-            type: object
-            created_at:
-            type: string
-            format: date-time
-            403:
-            description: Forbidden
-            422:
-            description: Dataset ID, cursor or limit is invalid.
-    """
-    if id_ := request.path_params.get("id"):
+        dataset = DatasetWithExampleCount(
+            id=str(dataset_id),
+            name=dataset.name,
+            description=dataset.description,
+            metadata=dataset.metadata_,
+            created_at=dataset.created_at,
+            updated_at=dataset.updated_at,
+            example_count=example_count,
+        )
+        return GetDatasetResponseBody(data=dataset)
+
+
+class DatasetVersion(V1RoutesBaseModel):
+    version_id: str
+    description: Optional[str]
+    metadata: Dict[str, Any]
+    created_at: datetime
+
+
+class ListDatasetVersionsResponseBody(PaginatedResponseBody[DatasetVersion]):
+    pass
+
+
+@router.get(
+    "/datasets/{id}/versions",
+    operation_id="listDatasetVersionsByDatasetId",
+    summary="List dataset versions",
+    responses=add_errors_to_responses([HTTP_422_UNPROCESSABLE_ENTITY]),
+)
+async def list_dataset_versions(
+    request: Request,
+    id: str = Path(description="The ID of the dataset"),
+    cursor: Optional[str] = Query(
+        default=None,
+        description="Cursor for pagination",
+    ),
+    limit: int = Query(
+        default=10, description="The max number of dataset versions to return at a time", gt=0
+    ),
+) -> ListDatasetVersionsResponseBody:
+    if id:
         try:
             dataset_id = from_global_id_with_expected_type(
-                GlobalID.from_id(
- …
+                GlobalID.from_id(id),
+                DATASET_NODE_NAME,
             )
         except ValueError:
- …
+            raise HTTPException(
+                detail=f"Invalid Dataset ID: {id}",
                 status_code=HTTP_422_UNPROCESSABLE_ENTITY,
             )
     else:
- …
-            status_code=HTTP_422_UNPROCESSABLE_ENTITY,
-        )
-    try:
-        limit = int(request.query_params.get("limit", 10))
-        assert limit > 0
-    except (ValueError, AssertionError):
-        return Response(
-            content="Invalid limit parameter",
+        raise HTTPException(
+            detail="Missing Dataset ID",
             status_code=HTTP_422_UNPROCESSABLE_ENTITY,
         )
     stmt = (
@@ -377,15 +292,14 @@ async def get_dataset_versions(request: Request) -> Response:
         .order_by(models.DatasetVersion.id.desc())
         .limit(limit + 1)
     )
-    if cursor …
+    if cursor:
        try:
            dataset_version_id = from_global_id_with_expected_type(
-                GlobalID.from_id(cursor),
-                DatasetVersion.__name__,
+                GlobalID.from_id(cursor), DATASET_VERSION_NODE_NAME
            )
        except ValueError:
- …
+            raise HTTPException(
+                detail=f"Invalid cursor: {cursor}",
                status_code=HTTP_422_UNPROCESSABLE_ENTITY,
            )
        max_dataset_version_id = (
@@ -396,102 +310,99 @@ async def get_dataset_versions(request: Request) -> Response:
        stmt = stmt.filter(models.DatasetVersion.id <= max_dataset_version_id)
    async with request.app.state.db() as session:
        data = [
- …
+            DatasetVersion(
+                version_id=str(GlobalID(DATASET_VERSION_NODE_NAME, str(version.id))),
+                description=version.description,
+                metadata=version.metadata_,
+                created_at=version.created_at,
+            )
            async for version in await session.stream_scalars(stmt)
        ]
-        next_cursor = data.pop()
-        return …
- …
-            description
- …
-            422:
-            description: Request body is invalid
-    """
+        next_cursor = data.pop().version_id if len(data) == limit + 1 else None
+        return ListDatasetVersionsResponseBody(data=data, next_cursor=next_cursor)
+
+
+class UploadDatasetData(V1RoutesBaseModel):
+    dataset_id: str
+
+
+class UploadDatasetResponseBody(ResponseBody[UploadDatasetData]):
+    pass
+
+
+@router.post(
+    "/datasets/upload",
+    operation_id="uploadDataset",
+    summary="Upload dataset from JSON, CSV, or PyArrow",
+    responses=add_errors_to_responses(
+        [
+            {
+                "status_code": HTTP_409_CONFLICT,
+                "description": "Dataset of the same name already exists",
+            },
+            {"status_code": HTTP_422_UNPROCESSABLE_ENTITY, "description": "Invalid request body"},
+        ]
+    ),
+    # FastAPI cannot generate the request body portion of the OpenAPI schema for
+    # routes that accept multiple request content types, so we have to provide
+    # this part of the schema manually. For context, see
+    # https://github.com/tiangolo/fastapi/discussions/7786 and
+    # https://github.com/tiangolo/fastapi/issues/990
+    openapi_extra={
+        "requestBody": {
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "type": "object",
+                        "required": ["name", "inputs"],
+                        "properties": {
+                            "action": {"type": "string", "enum": ["create", "append"]},
+                            "name": {"type": "string"},
+                            "description": {"type": "string"},
+                            "inputs": {"type": "array", "items": {"type": "object"}},
+                            "outputs": {"type": "array", "items": {"type": "object"}},
+                            "metadata": {"type": "array", "items": {"type": "object"}},
+                        },
+                    }
+                },
+                "multipart/form-data": {
+                    "schema": {
+                        "type": "object",
+                        "required": ["name", "input_keys[]", "output_keys[]", "file"],
+                        "properties": {
+                            "action": {"type": "string", "enum": ["create", "append"]},
+                            "name": {"type": "string"},
+                            "description": {"type": "string"},
+                            "input_keys[]": {
+                                "type": "array",
+                                "items": {"type": "string"},
+                                "uniqueItems": True,
+                            },
+                            "output_keys[]": {
+                                "type": "array",
+                                "items": {"type": "string"},
+                                "uniqueItems": True,
+                            },
+                            "metadata_keys[]": {
+                                "type": "array",
+                                "items": {"type": "string"},
+                                "uniqueItems": True,
+                            },
+                            "file": {"type": "string", "format": "binary"},
+                        },
+                    }
+                },
+            }
+        },
+    },
+)
+async def upload_dataset(
+    request: Request,
+    sync: bool = Query(
+        default=False,
+        description="If true, fulfill request synchronously and return JSON containing dataset_id.",
+    ),
+) -> Optional[UploadDatasetResponseBody]:
    request_content_type = request.headers["content-type"]
    examples: Union[Examples, Awaitable[Examples]]
    if request_content_type.startswith("application/json"):
@@ -500,15 +411,15 @@ async def post_datasets_upload(request: Request) -> Response:
                _process_json, await request.json()
            )
        except ValueError as e:
- …
+            raise HTTPException(
+                detail=str(e),
                status_code=HTTP_422_UNPROCESSABLE_ENTITY,
            )
        if action is DatasetAction.CREATE:
            async with request.app.state.db() as session:
                if await _check_table_exists(session, name):
- …
+                    raise HTTPException(
+                        detail=f"Dataset with the same name already exists: {name=}",
                        status_code=HTTP_409_CONFLICT,
                    )
    elif request_content_type.startswith("multipart/form-data"):
@@ -524,15 +435,15 @@ async def post_datasets_upload(request: Request) -> Response:
                file,
            ) = await _parse_form_data(form)
        except ValueError as e:
- …
+            raise HTTPException(
+                detail=str(e),
                status_code=HTTP_422_UNPROCESSABLE_ENTITY,
            )
        if action is DatasetAction.CREATE:
            async with request.app.state.db() as session:
                if await _check_table_exists(session, name):
- …
+                    raise HTTPException(
+                        detail=f"Dataset with the same name already exists: {name=}",
                        status_code=HTTP_409_CONFLICT,
                    )
        content = await file.read()
@@ -548,13 +459,13 @@ async def post_datasets_upload(request: Request) -> Response:
            else:
                assert_never(file_content_type)
        except ValueError as e:
- …
+            raise HTTPException(
+                detail=str(e),
                status_code=HTTP_422_UNPROCESSABLE_ENTITY,
            )
    else:
- …
+        raise HTTPException(
+            detail="Invalid request Content-Type",
            status_code=HTTP_422_UNPROCESSABLE_ENTITY,
        )
    operation = cast(
@@ -567,19 +478,19 @@ async def post_datasets_upload(request: Request) -> Response:
            description=description,
        ),
    )
-    if …
+    if sync:
        async with request.app.state.db() as session:
            dataset_id = (await operation(session)).dataset_id
-            return …
+            return UploadDatasetResponseBody(
+                data=UploadDatasetData(dataset_id=str(GlobalID(Dataset.__name__, str(dataset_id))))
            )
    try:
        request.state.enqueue_operation(operation)
    except QueueFull:
        if isinstance(examples, Coroutine):
            examples.close()
- …
-        return …
+        raise HTTPException(detail="Too many requests.", status_code=HTTP_429_TOO_MANY_REQUESTS)
+    return None
 
 
 class FileContentType(Enum):
@@ -757,158 +668,255 @@ async def _parse_form_data(
    )
 
 
- …
+class DatasetExample(V1RoutesBaseModel):
+    id: str
+    input: Dict[str, Any]
+    output: Dict[str, Any]
+    metadata: Dict[str, Any]
+    updated_at: datetime
+
+
+class ListDatasetExamplesData(V1RoutesBaseModel):
+    dataset_id: str
+    version_id: str
+    examples: List[DatasetExample]
+
+
+class ListDatasetExamplesResponseBody(ResponseBody[ListDatasetExamplesData]):
+    pass
+
+
+@router.get(
+    "/datasets/{id}/examples",
+    operation_id="getDatasetExamples",
+    summary="Get examples from a dataset",
+    responses=add_errors_to_responses([HTTP_404_NOT_FOUND]),
+)
+async def get_dataset_examples(
+    request: Request,
+    id: str = Path(description="The ID of the dataset"),
+    version_id: Optional[str] = Query(
+        default=None,
+        description=(
+            "The ID of the dataset version " "(if omitted, returns data from the latest version)"
+        ),
+    ),
+) -> ListDatasetExamplesResponseBody:
+    dataset_gid = GlobalID.from_id(id)
+    version_gid = GlobalID.from_id(version_id) if version_id else None
+
+    if (dataset_type := dataset_gid.type_name) != "Dataset":
+        raise HTTPException(
+            detail=f"ID {dataset_gid} refers to a {dataset_type}", status_code=HTTP_404_NOT_FOUND
+        )
+
+    if version_gid and (version_type := version_gid.type_name) != "DatasetVersion":
+        raise HTTPException(
+            detail=f"ID {version_gid} refers to a {version_type}", status_code=HTTP_404_NOT_FOUND
+        )
+
+    async with request.app.state.db() as session:
+        if (
+            resolved_dataset_id := await session.scalar(
+                select(models.Dataset.id).where(models.Dataset.id == int(dataset_gid.node_id))
+            )
+        ) is None:
+            raise HTTPException(
+                detail=f"No dataset with id {dataset_gid} can be found.",
+                status_code=HTTP_404_NOT_FOUND,
+            )
+
+        # Subquery to find the maximum created_at for each dataset_example_id
+        # timestamp tiebreaks are resolved by the largest id
+        partial_subquery = select(
+            func.max(models.DatasetExampleRevision.id).label("max_id"),
+        ).group_by(models.DatasetExampleRevision.dataset_example_id)
+
+        if version_gid:
+            if (
+                resolved_version_id := await session.scalar(
+                    select(models.DatasetVersion.id).where(
+                        and_(
+                            models.DatasetVersion.dataset_id == resolved_dataset_id,
+                            models.DatasetVersion.id == int(version_gid.node_id),
+                        )
+                    )
+                )
+            ) is None:
+                raise HTTPException(
+                    detail=f"No dataset version with id {version_id} can be found.",
+                    status_code=HTTP_404_NOT_FOUND,
+                )
+            # if a version_id is provided, filter the subquery to only include revisions from that
+            partial_subquery = partial_subquery.filter(
+                models.DatasetExampleRevision.dataset_version_id <= resolved_version_id
+            )
+        else:
+            if (
+                resolved_version_id := await session.scalar(
+                    select(func.max(models.DatasetVersion.id)).where(
+                        models.DatasetVersion.dataset_id == resolved_dataset_id
+                    )
+                )
+            ) is None:
+                raise HTTPException(
+                    detail="Dataset has no versions.",
+                    status_code=HTTP_404_NOT_FOUND,
+                )
+
+        subquery = partial_subquery.subquery()
+        # Query for the most recent example revisions that are not deleted
+        query = (
+            select(models.DatasetExample, models.DatasetExampleRevision)
+            .join(
+                models.DatasetExampleRevision,
+                models.DatasetExample.id == models.DatasetExampleRevision.dataset_example_id,
+            )
+            .join(
+                subquery,
+                (subquery.c.max_id == models.DatasetExampleRevision.id),
+            )
+            .filter(models.DatasetExample.dataset_id == resolved_dataset_id)
+            .filter(models.DatasetExampleRevision.revision_kind != "DELETE")
+            .order_by(models.DatasetExample.id.asc())
+        )
+        examples = [
+            DatasetExample(
+                id=str(GlobalID("DatasetExample", str(example.id))),
+                input=revision.input,
+                output=revision.output,
+                metadata=revision.metadata_,
+                updated_at=revision.created_at,
+            )
+            async for example, revision in await session.stream(query)
+        ]
+        return ListDatasetExamplesResponseBody(
+            data=ListDatasetExamplesData(
+                dataset_id=str(GlobalID("Dataset", str(resolved_dataset_id))),
+                version_id=str(GlobalID("DatasetVersion", str(resolved_version_id))),
+                examples=examples,
+            )
+        )
+
+
+@router.get(
+    "/datasets/{id}/csv",
+    operation_id="getDatasetCsv",
+    summary="Download dataset examples as CSV file",
+    response_class=StreamingResponse,
+    status_code=HTTP_200_OK,
+    responses={
+        **add_errors_to_responses([HTTP_422_UNPROCESSABLE_ENTITY]),
+        **add_text_csv_content_to_responses(HTTP_200_OK),
+    },
+)
+async def get_dataset_csv(
+    request: Request,
+    response: Response,
+    id: str = Path(description="The ID of the dataset"),
+    version_id: Optional[str] = Query(
+        default=None,
+        description=(
+            "The ID of the dataset version " "(if omitted, returns data from the latest version)"
+        ),
+    ),
+) -> Response:
    try:
- …
+        async with request.app.state.db() as session:
+            dataset_name, examples = await _get_db_examples(
+                session=session, id=id, version_id=version_id
+            )
    except ValueError as e:
- …
+        raise HTTPException(detail=str(e), status_code=HTTP_422_UNPROCESSABLE_ENTITY)
    content = await run_in_threadpool(_get_content_csv, examples)
    return Response(
        content=content,
        headers={
            "content-disposition": f'attachment; filename="{dataset_name}.csv"',
            "content-type": "text/csv",
-            "content-encoding": "gzip",
        },
    )
 
 
- …
-    ""
- …
-            description
- …
-            contentMediaType: text/plain
-            contentEncoding: gzip
-            403:
-            description: Forbidden
-            404:
-            description: Dataset does not exist.
-            422:
-            description: Dataset ID or version ID is invalid.
-    """
+@router.get(
+    "/datasets/{id}/jsonl/openai_ft",
+    operation_id="getDatasetJSONLOpenAIFineTuning",
+    summary="Download dataset examples as OpenAI fine-tuning JSONL file",
+    response_class=PlainTextResponse,
+    responses=add_errors_to_responses(
+        [
+            {
+                "status_code": HTTP_422_UNPROCESSABLE_ENTITY,
+                "description": "Invalid dataset or version ID",
+            }
+        ]
+    ),
+)
+async def get_dataset_jsonl_openai_ft(
+    request: Request,
+    response: Response,
+    id: str = Path(description="The ID of the dataset"),
+    version_id: Optional[str] = Query(
+        default=None,
+        description=(
+            "The ID of the dataset version " "(if omitted, returns data from the latest version)"
+        ),
+    ),
+) -> bytes:
    try:
- …
+        async with request.app.state.db() as session:
+            dataset_name, examples = await _get_db_examples(
+                session=session, id=id, version_id=version_id
+            )
    except ValueError as e:
- …
+        raise HTTPException(detail=str(e), status_code=HTTP_422_UNPROCESSABLE_ENTITY)
    content = await run_in_threadpool(_get_content_jsonl_openai_ft, examples)
- …
-        headers={
-            "content-disposition": f'attachment; filename="{dataset_name}.jsonl"',
-            "content-type": "text/plain",
-            "content-encoding": "gzip",
-        },
-    )
+    response.headers["content-disposition"] = f'attachment; filename="{dataset_name}.jsonl"'
+    return content
 
 
- …
-    ""
- …
-            description
- …
-            contentMediaType: text/plain
-            contentEncoding: gzip
-            403:
-            description: Forbidden
-            404:
-            description: Dataset does not exist.
-            422:
-            description: Dataset ID or version ID is invalid.
-    """
+@router.get(
+    "/datasets/{id}/jsonl/openai_evals",
+    operation_id="getDatasetJSONLOpenAIEvals",
+    summary="Download dataset examples as OpenAI evals JSONL file",
+    response_class=PlainTextResponse,
+    responses=add_errors_to_responses(
+        [
+            {
+                "status_code": HTTP_422_UNPROCESSABLE_ENTITY,
+                "description": "Invalid dataset or version ID",
+            }
+        ]
+    ),
+)
+async def get_dataset_jsonl_openai_evals(
+    request: Request,
+    response: Response,
+    id: str = Path(description="The ID of the dataset"),
+    version_id: Optional[str] = Query(
+        default=None,
+        description=(
+            "The ID of the dataset version " "(if omitted, returns data from the latest version)"
+        ),
+    ),
+) -> bytes:
    try:
- …
+        async with request.app.state.db() as session:
+            dataset_name, examples = await _get_db_examples(
+                session=session, id=id, version_id=version_id
+            )
    except ValueError as e:
- …
+        raise HTTPException(detail=str(e), status_code=HTTP_422_UNPROCESSABLE_ENTITY)
    content = await run_in_threadpool(_get_content_jsonl_openai_evals, examples)
- …
-        headers={
-            "content-disposition": f'attachment; filename="{dataset_name}.jsonl"',
-            "content-type": "text/plain",
-            "content-encoding": "gzip",
-        },
-    )
+    response.headers["content-disposition"] = f'attachment; filename="{dataset_name}.jsonl"'
+    return content
 
 
 def _get_content_csv(examples: List[models.DatasetExampleRevision]) -> bytes:
    records = [
        {
            "example_id": GlobalID(
-                type_name=…
+                type_name=DatasetExampleNodeType.__name__,
                node_id=str(ex.dataset_example_id),
            ),
            **{f"input_{k}": v for k, v in ex.input.items()},
@@ -917,7 +925,7 @@ def _get_content_csv(examples: List[models.DatasetExampleRevision]) -> bytes:
        }
        for ex in examples
    ]
-    return …
+    return str(pd.DataFrame.from_records(records).to_csv(index=False)).encode()
 
 
 def _get_content_jsonl_openai_ft(examples: List[models.DatasetExampleRevision]) -> bytes:
@@ -938,7 +946,7 @@ def _get_content_jsonl_openai_ft(examples: List[models.DatasetExampleRevision])
            ).encode()
        )
    records.seek(0)
-    return …
+    return records.read()
 
 
 def _get_content_jsonl_openai_evals(examples: List[models.DatasetExampleRevision]) -> bytes:
@@ -965,18 +973,17 @@ def _get_content_jsonl_openai_evals(examples: List[models.DatasetExampleRevision
            ).encode()
        )
    records.seek(0)
-    return …
+    return records.read()
 
 
-async def _get_db_examples(
- …
-    dataset_id = from_global_id_with_expected_type(GlobalID.from_id(…
+async def _get_db_examples(
+    *, session: Any, id: str, version_id: Optional[str]
+) -> Tuple[str, List[models.DatasetExampleRevision]]:
+    dataset_id = from_global_id_with_expected_type(GlobalID.from_id(id), DATASET_NODE_NAME)
    dataset_version_id: Optional[int] = None
-    if version_id …
+    if version_id:
        dataset_version_id = from_global_id_with_expected_type(
-            GlobalID.from_id(version_id),
-            DatasetVersion.__name__,
+            GlobalID.from_id(version_id), DATASET_VERSION_NODE_NAME
        )
    latest_version = (
        select(
@@ -1009,13 +1016,12 @@ async def _get_db_examples(request: Request) -> Tuple[str, List[models.DatasetEx
        .where(models.DatasetExampleRevision.revision_kind != "DELETE")
        .order_by(models.DatasetExampleRevision.dataset_example_id)
    )
- …
-    examples = [r async for r in await session.stream_scalars(stmt)]
+    dataset_name: Optional[str] = await session.scalar(
+        select(models.Dataset.name).where(models.Dataset.id == dataset_id)
+    )
+    if not dataset_name:
+        raise ValueError("Dataset does not exist.")
+    examples = [r async for r in await session.stream_scalars(stmt)]
    return dataset_name, examples
 
 
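
To make the shape of the migrated API concrete, here is a minimal client sketch (not part of the diff above) that pages through the new list-datasets route using the data and next_cursor fields of ListDatasetsResponseBody shown in the diff. The base URL, the /v1 mount prefix, and the use of httpx are assumptions about a typical Phoenix deployment, not something this diff specifies.

import httpx  # assumed HTTP client; any client that can issue GET requests works


def list_all_datasets(base_url: str = "http://localhost:6006") -> list:
    """Collect every dataset by following next_cursor until pagination is exhausted."""
    datasets, cursor = [], None
    with httpx.Client(base_url=base_url) as client:
        while True:
            params = {"limit": 50}
            if cursor is not None:
                params["cursor"] = cursor
            # "/v1" prefix is an assumption about where the router above is mounted
            resp = client.get("/v1/datasets", params=params)
            resp.raise_for_status()
            # Body shape per ListDatasetsResponseBody: {"data": [...], "next_cursor": ...}
            body = resp.json()
            datasets.extend(body["data"])
            cursor = body.get("next_cursor")
            if not cursor:
                return datasets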