aind-data-transfer-service 1.12.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,300 @@
+ """Core models for using V2 of aind-data-transfer-service"""
+
+ import json
+ from contextlib import contextmanager
+ from contextvars import ContextVar
+ from datetime import datetime
+ from typing import Any, Dict, List, Literal, Optional, Set, Union
+
+ from aind_data_schema_models.data_name_patterns import build_data_name
+ from aind_data_schema_models.modalities import Modality
+ from aind_data_schema_models.platforms import Platform
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     EmailStr,
+     Field,
+     ValidationInfo,
+     computed_field,
+     field_validator,
+     model_validator,
+ )
+ from pydantic_settings import BaseSettings
+
+ _validation_context: ContextVar[Union[Dict[str, Any], None]] = ContextVar(
+     "_validation_context", default=None
+ )
+
+
+ @contextmanager
+ def validation_context(context: Union[Dict[str, Any], None]) -> None:
+     """
+     Following guide in:
+     https://docs.pydantic.dev/latest/concepts/validators/#validation-context
+     Parameters
+     ----------
+     context : Union[Dict[str, Any], None]
+
+     Returns
+     -------
+     None
+
+     """
+     token = _validation_context.set(context)
+     try:
+         yield
+     finally:
+         _validation_context.reset(token)
+
+
+ class Task(BaseModel):
+     """Configuration for a task run during a data transfer upload job."""
+
+     skip_task: bool = Field(
+         default=False,
+         description=(
+             "Skip running this task. If true, only task_id and skip_step are "
+             "required."
+         ),
+         title="Skip Step",
+     )
+     image: Optional[str] = Field(
+         default=None, description="Name of docker image to run", title="Image"
+     )
+     image_version: Optional[str] = Field(
+         default=None,
+         description="Version of docker image to run",
+         title="Image Version",
+     )
+     image_resources: Optional[Dict[str, Any]] = Field(
+         default=None,
+         description="Slurm environment. Must be json serializable.",
+         title="Image Resources",
+     )
+     job_settings: Optional[Dict[str, Any]] = Field(
+         default=None,
+         description="Settings for the job.",
+         title="Job Settings",
+     )
+     command_script: Optional[str] = Field(
+         default=None,
+         description=(
+             """
+             Command script to run. A few strings may be replaced:
+             %JOB_SETTINGS: This will be replaced with json.dumps(job_settings)
+             %OUTPUT_LOCATION: Output location such as a local directory
+             %S3_LOCATION: Location of S3 where to upload data to
+             %INPUT_SOURCE: If a job requires a dynamic input source,
+             then this may be replaced.
+             %IMAGE: The containerized image.
+             %IMAGE_VERSION: The image version.
+             %ENV_FILE: An environment file location, such as aws configs.
+             """
+         ),
+     )
+
+     @field_validator(
+         "image_resources",
+         "job_settings",
+         mode="after",
+     )
+     def validate_json_serializable(
+         cls, v: Optional[Dict[str, Any]], info: ValidationInfo
+     ) -> Optional[Dict[str, Any]]:
+         """Validate that fields are json serializable."""
+         if v is not None:
+             try:
+                 json.dumps(v)
+             except Exception as e:
+                 raise ValueError(
+                     f"{info.field_name} must be json serializable! If "
+                     f"converting from a Pydantic model, please use "
+                     f'model.model_dump(mode="json"). {e}'
+                 )
+         return v
+
+
+ class UploadJobConfigsV2(BaseSettings):
+     """Configuration for a data transfer upload job"""
+
+     # noinspection PyMissingConstructor
+     def __init__(self, /, **data: Any) -> None:
+         """Add context manager to init for validating fields."""
+         self.__pydantic_validator__.validate_python(
+             data,
+             self_instance=self,
+             context=_validation_context.get(),
+         )
+
+     model_config = ConfigDict(use_enum_values=True, extra="ignore")
+
+     job_type: str = Field(
+         default="default",
+         description=(
+             "Job type for the upload job. Tasks will be run based on the "
+             "job_type unless otherwise specified in task_overrides."
+         ),
+         title="Job Type",
+     )
+
+     user_email: Optional[EmailStr] = Field(
+         default=None,
+         description=(
+             "Optional email address to receive job status notifications"
+         ),
+     )
+     email_notification_types: Optional[
+         Set[Literal["begin", "end", "fail", "retry", "all"]]
+     ] = Field(
+         default=None,
+         description=(
+             "Types of job statuses to receive email notifications about"
+         ),
+     )
+     s3_bucket: Literal["private", "open", "default"] = Field(
+         default="default",
+         description=(
+             "Bucket where data will be uploaded. If not provided, will upload "
+             "to default bucket."
+         ),
+         title="S3 Bucket",
+     )
+
+     project_name: str = Field(
+         ..., description="Name of project", title="Project Name"
+     )
+     platform: Platform.ONE_OF = Field(
+         ..., description="Platform", title="Platform"
+     )
+     modalities: List[Modality.ONE_OF] = Field(
+         ...,
+         description="Data collection modalities",
+         title="Modalities",
+         min_length=1,
+     )
+     subject_id: str = Field(..., description="Subject ID", title="Subject ID")
+     acq_datetime: datetime = Field(
+         ...,
+         description="Datetime data was acquired",
+         title="Acquisition Datetime",
+     )
+     tasks: Dict[str, Union[Task, Dict[str, Task]]] = Field(
+         ...,
+         description=(
+             "Dictionary of tasks to run with custom settings. The key must be "
+             "the task_id and the value must be the task or list of tasks."
+         ),
+         title="Tasks",
+     )
+
+     @computed_field
+     def s3_prefix(self) -> str:
+         """Construct s3_prefix from configs."""
+         return build_data_name(
+             label=f"{self.platform.abbreviation}_{self.subject_id}",
+             creation_datetime=self.acq_datetime,
+         )
+
+     @field_validator("job_type", "project_name", mode="before")
+     def validate_with_context(cls, v: str, info: ValidationInfo) -> str:
+         """
+         Validate certain fields. If a list of accepted values is provided in a
+         context manager, then it will validate against the list. Otherwise, it
+         won't raise any validation error.
+
+         Parameters
+         ----------
+         v : str
+             Value input into the field.
+         info : ValidationInfo
+
+         Returns
+         -------
+         str
+
+         """
+         valid_list = (info.context or dict()).get(f"{info.field_name}s")
+         if valid_list is not None and v not in valid_list:
+             raise ValueError(f"{v} must be one of {valid_list}")
+         else:
+             return v
+
+
+ class SubmitJobRequestV2(BaseSettings):
+     """Main request that will be sent to the backend. Bundles jobs into a list
+     and allows a user to add an email address to receive notifications."""
+
+     # noinspection PyMissingConstructor
+     def __init__(self, /, **data: Any) -> None:
+         """Add context manager to init for validating upload_jobs."""
+         self.__pydantic_validator__.validate_python(
+             data,
+             self_instance=self,
+             context=_validation_context.get(),
+         )
+
+     model_config = ConfigDict(use_enum_values=True, extra="ignore")
+
+     dag_id: Literal["transform_and_upload_v2"] = "transform_and_upload_v2"
+     user_email: Optional[EmailStr] = Field(
+         default=None,
+         description=(
+             "Optional email address to receive job status notifications"
+         ),
+     )
+     email_notification_types: Set[
+         Literal["begin", "end", "fail", "retry", "all"]
+     ] = Field(
+         default={"fail"},
+         description=(
+             "Types of job statuses to receive email notifications about"
+         ),
+     )
+     upload_jobs: List[UploadJobConfigsV2] = Field(
+         ...,
+         description="List of upload jobs to process. Max of 50 at a time.",
+         min_length=1,
+         max_length=50,
+     )
+
+     @model_validator(mode="after")
+     def propagate_email_settings(self):
+         """Propagate email settings from global to individual jobs"""
+         global_email_user = self.user_email
+         global_email_notification_types = self.email_notification_types
+         for upload_job in self.upload_jobs:
+             if global_email_user is not None and upload_job.user_email is None:
+                 upload_job.user_email = global_email_user
+             if upload_job.email_notification_types is None:
+                 upload_job.email_notification_types = (
+                     global_email_notification_types
+                 )
+         return self
+
+     @model_validator(mode="after")
+     def check_duplicate_upload_jobs(self, info: ValidationInfo):
+         """Validate that there are no duplicate upload jobs. If a list of
+         current jobs is provided in a context manager, jobs are also checked
+         against the list."""
+         jobs_map = dict()
+         # check jobs with the same s3_prefix
+         for job in self.upload_jobs:
+             prefix = job.s3_prefix
+             job_json = json.dumps(
+                 job.model_dump(mode="json", exclude_none=True), sort_keys=True
+             )
+             jobs_map.setdefault(prefix, set())
+             if job_json in jobs_map[prefix]:
+                 raise ValueError(f"Duplicate jobs found for {prefix}")
+             jobs_map[prefix].add(job_json)
+         # check against any jobs in the context
+         current_jobs = (info.context or dict()).get("current_jobs", list())
+         for job in current_jobs:
+             prefix = job.get("s3_prefix")
+             if (
+                 prefix is not None
+                 and prefix in jobs_map
+                 and json.dumps(job, sort_keys=True) in jobs_map[prefix]
+             ):
+                 raise ValueError(f"Job is already running/queued for {prefix}")
+         return self
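
Taken together, the models in this file let a client compose an upload request programmatically. The sketch below is a minimal, hypothetical usage example: the project name, subject ID, task ID, task settings, and the Platform/Modality members are illustrative rather than taken from this diff, and it simply assumes the classes defined above are importable.

    from datetime import datetime

    from aind_data_schema_models.modalities import Modality
    from aind_data_schema_models.platforms import Platform

    # Hypothetical task settings; job_settings must be json serializable,
    # so Pydantic models should be passed through model_dump(mode="json").
    modality_task = Task(job_settings={"input_source": "/data/ecephys_123456"})

    # If the caller knows the accepted project names (e.g. fetched from the
    # service), validation can be made strict via the context manager; without
    # a context, project_name and job_type are accepted as-is.
    with validation_context({"project_names": ["Example Project"]}):
        upload_job = UploadJobConfigsV2(
            project_name="Example Project",
            platform=Platform.ECEPHYS,
            modalities=[Modality.ECEPHYS],
            subject_id="123456",
            acq_datetime=datetime(2025, 1, 2, 3, 4, 5),
            tasks={"make_modality_list": modality_task},
        )
        request = SubmitJobRequestV2(upload_jobs=[upload_job])

    print(upload_job.s3_prefix)  # e.g. "ecephys_123456_2025-01-02_03-04-05"

Because the job above sets no user_email, any user_email and email_notification_types supplied on SubmitJobRequestV2 would be copied onto it by propagate_email_settings, and check_duplicate_upload_jobs would reject a second identical job with the same s3_prefix.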
@@ -0,0 +1,277 @@
+ """Module for internal data models used in application"""
+
+ import ast
+ import os
+ from datetime import datetime, timedelta, timezone
+ from typing import List, Optional, Union
+
+ from mypy_boto3_ssm.type_defs import ParameterMetadataTypeDef
+ from pydantic import AwareDatetime, BaseModel, Field, field_validator
+ from starlette.datastructures import QueryParams
+
+
+ class AirflowDagRun(BaseModel):
+     """Data model for dag_run entry when requesting info from airflow"""
+
+     conf: Optional[dict]
+     dag_id: Optional[str]
+     dag_run_id: Optional[str]
+     data_interval_end: Optional[AwareDatetime]
+     data_interval_start: Optional[AwareDatetime]
+     end_date: Optional[AwareDatetime]
+     execution_date: Optional[AwareDatetime]
+     external_trigger: Optional[bool]
+     last_scheduling_decision: Optional[AwareDatetime]
+     logical_date: Optional[AwareDatetime]
+     note: Optional[str]
+     run_type: Optional[str]
+     start_date: Optional[AwareDatetime]
+     state: Optional[str]
+
+
+ class AirflowDagRunsResponse(BaseModel):
+     """Data model for response when requesting info from dag_runs endpoint"""
+
+     dag_runs: List[AirflowDagRun]
+     total_entries: int
+
+
+ class AirflowDagRunsRequestParameters(BaseModel):
+     """Model for parameters when requesting info from dag_runs endpoint"""
+
+     dag_ids: list[str] = ["transform_and_upload", "transform_and_upload_v2"]
+     page_limit: int = 100
+     page_offset: int = 0
+     states: Optional[list[str]] = []
+     execution_date_gte: Optional[str] = (
+         datetime.now(timezone.utc) - timedelta(weeks=2)
+     ).isoformat()
+     execution_date_lte: Optional[str] = None
+     order_by: str = "-execution_date"
+
+     @field_validator("execution_date_gte", mode="after")
+     def validate_min_execution_date(cls, execution_date_gte: str):
+         """Validate the earliest submit date filter is within 2 weeks"""
+         min_execution_date = datetime.now(timezone.utc) - timedelta(weeks=2)
+         # datetime.fromisoformat does not support Z in python < 3.11
+         date_to_check = execution_date_gte.replace("Z", "+00:00")
+         if datetime.fromisoformat(date_to_check) < min_execution_date:
+             raise ValueError(
+                 "execution_date_gte must be within the last 2 weeks"
+             )
+         return execution_date_gte
+
+     @classmethod
+     def from_query_params(cls, query_params: QueryParams):
+         """Maps the query parameters to the model"""
+         params = dict(query_params)
+         if "states" in params:
+             params["states"] = ast.literal_eval(params["states"])
+         return cls.model_validate(params)
+
+
+ class AirflowTaskInstancesRequestParameters(BaseModel):
+     """Model for parameters when requesting info from task_instances
+     endpoint"""
+
+     dag_id: str = Field(..., min_length=1)
+     dag_run_id: str = Field(..., min_length=1)
+
+     @classmethod
+     def from_query_params(cls, query_params: QueryParams):
+         """Maps the query parameters to the model"""
+         params = dict(query_params)
+         return cls.model_validate(params)
+
+
+ class AirflowTaskInstance(BaseModel):
+     """Data model for task_instance entry when requesting info from airflow"""
+
+     dag_id: Optional[str]
+     dag_run_id: Optional[str]
+     duration: Optional[Union[int, float]]
+     end_date: Optional[AwareDatetime]
+     execution_date: Optional[AwareDatetime]
+     executor_config: Optional[str]
+     hostname: Optional[str]
+     map_index: Optional[int]
+     max_tries: Optional[int]
+     note: Optional[str]
+     operator: Optional[str]
+     pid: Optional[int]
+     pool: Optional[str]
+     pool_slots: Optional[int]
+     priority_weight: Optional[int]
+     queue: Optional[str]
+     queued_when: Optional[AwareDatetime]
+     rendered_fields: Optional[dict]
+     sla_miss: Optional[dict]
+     start_date: Optional[AwareDatetime]
+     state: Optional[str]
+     task_id: Optional[str]
+     trigger: Optional[dict]
+     triggerer_job: Optional[dict]
+     try_number: Optional[int]
+     unixname: Optional[str]
+
+
+ class AirflowTaskInstancesResponse(BaseModel):
+     """Data model for response when requesting info from task_instances
+     endpoint"""
+
+     task_instances: List[AirflowTaskInstance]
+     total_entries: int
+
+
+ class AirflowTaskInstanceLogsRequestParameters(BaseModel):
+     """Model for parameters when requesting info from task_instance_logs
+     endpoint"""
+
+     # excluded fields are used to build the url
+     dag_id: str = Field(..., min_length=1, exclude=True)
+     dag_run_id: str = Field(..., min_length=1, exclude=True)
+     task_id: str = Field(..., min_length=1, exclude=True)
+     try_number: int = Field(..., ge=0, exclude=True)
+     map_index: int = Field(..., ge=-1)
+     full_content: bool = True
+
+     @classmethod
+     def from_query_params(cls, query_params: QueryParams):
+         """Maps the query parameters to the model"""
+         params = dict(query_params)
+         return cls.model_validate(params)
+
+
+ class JobStatus(BaseModel):
+     """Model for what we want to render to the user."""
+
+     dag_id: Optional[str] = Field(None)
+     end_time: Optional[datetime] = Field(None)
+     job_id: Optional[str] = Field(None)
+     job_state: Optional[str] = Field(None)
+     name: Optional[str] = Field(None)
+     job_type: Optional[str] = Field(None)
+     comment: Optional[str] = Field(None)
+     start_time: Optional[datetime] = Field(None)
+     submit_time: Optional[datetime] = Field(None)
+
+     @classmethod
+     def from_airflow_dag_run(cls, airflow_dag_run: AirflowDagRun):
+         """Maps the fields from the HpcJobStatusResponse to this model"""
+         name = airflow_dag_run.conf.get("s3_prefix", "")
+         job_type = airflow_dag_run.conf.get("job_type", "")
+         # v1 job_type is in CO configs
+         if job_type == "":
+             job_type = airflow_dag_run.conf.get("codeocean_configs", {}).get(
+                 "job_type", ""
+             )
+         return cls(
+             dag_id=airflow_dag_run.dag_id,
+             end_time=airflow_dag_run.end_date,
+             job_id=airflow_dag_run.dag_run_id,
+             job_state=airflow_dag_run.state,
+             name=name,
+             job_type=job_type,
+             comment=airflow_dag_run.note,
+             start_time=airflow_dag_run.start_date,
+             submit_time=airflow_dag_run.execution_date,
+         )
+
+     @property
+     def jinja_dict(self):
+         """Map model to a dictionary that jinja can render"""
+         return self.model_dump(exclude_none=True)
+
+
+ class JobTasks(BaseModel):
+     """Model for what is rendered to the user for each task."""
+
+     dag_id: Optional[str] = Field(None)
+     job_id: Optional[str] = Field(None)
+     task_id: Optional[str] = Field(None)
+     try_number: Optional[int] = Field(None)
+     task_state: Optional[str] = Field(None)
+     priority_weight: Optional[int] = Field(None)
+     map_index: Optional[int] = Field(None)
+     submit_time: Optional[datetime] = Field(None)
+     start_time: Optional[datetime] = Field(None)
+     end_time: Optional[datetime] = Field(None)
+     duration: Optional[Union[int, float]] = Field(None)
+     comment: Optional[str] = Field(None)
+
+     @classmethod
+     def from_airflow_task_instance(
+         cls, airflow_task_instance: AirflowTaskInstance
+     ):
+         """Maps the fields from the HpcJobStatusResponse to this model"""
+         return cls(
+             dag_id=airflow_task_instance.dag_id,
+             job_id=airflow_task_instance.dag_run_id,
+             task_id=airflow_task_instance.task_id,
+             try_number=airflow_task_instance.try_number,
+             task_state=airflow_task_instance.state,
+             priority_weight=airflow_task_instance.priority_weight,
+             map_index=airflow_task_instance.map_index,
+             submit_time=airflow_task_instance.execution_date,
+             start_time=airflow_task_instance.start_date,
+             end_time=airflow_task_instance.end_date,
+             duration=airflow_task_instance.duration,
+             comment=airflow_task_instance.note,
+         )
+
+
+ class JobParamInfo(BaseModel):
+     """Model for job parameter info from AWS Parameter Store"""
+
+     name: Optional[str]
+     last_modified: Optional[datetime]
+     job_type: str
+     task_id: str
+     modality: Optional[str]
+
+     @classmethod
+     def from_aws_describe_parameter(
+         cls,
+         parameter: ParameterMetadataTypeDef,
+         job_type: str,
+         task_id: str,
+         modality: Optional[str],
+     ):
+         """Map the parameter to the model"""
+         return cls(
+             name=parameter.get("Name"),
+             last_modified=parameter.get("LastModifiedDate"),
+             job_type=job_type,
+             task_id=task_id,
+             modality=modality,
+         )
+
+     @staticmethod
+     def get_parameter_prefix(version: Optional[str] = None) -> str:
+         """Get the prefix for job_type parameters"""
+         prefix = os.getenv("AIND_AIRFLOW_PARAM_PREFIX")
+         if version is None:
+             return prefix
+         return f"{prefix}/{version}"
+
+     @staticmethod
+     def get_parameter_regex(version: Optional[str] = None) -> str:
+         """Create the regex pattern to match the parameter name"""
+         prefix = os.getenv("AIND_AIRFLOW_PARAM_PREFIX")
+         regex = (
+             "(?P<job_type>[^/]+)/tasks/(?P<task_id>[^/]+)"
+             "(?:/(?P<modality>[^/]+))?"
+         )
+         if version is None:
+             return f"{prefix}/{regex}"
+         return f"{prefix}/{version}/{regex}"
+
+     @staticmethod
+     def get_parameter_name(
+         job_type: str, task_id: str, version: Optional[str] = None
+     ) -> str:
+         """Create the parameter name from job_type and task_id"""
+         prefix = os.getenv("AIND_AIRFLOW_PARAM_PREFIX")
+         if version is None:
+             return f"{prefix}/{job_type}/tasks/{task_id}"
+         return f"{prefix}/{version}/{job_type}/tasks/{task_id}"
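
For reference, a short sketch of how the request-parameter and Parameter Store helpers above might be exercised. The environment value, job_type, and task_id below are made up for illustration; only the classes defined in this file and starlette's QueryParams are assumed.

    import os

    from starlette.datastructures import QueryParams

    # Hypothetical SSM prefix; the real value is deployment-specific.
    os.environ["AIND_AIRFLOW_PARAM_PREFIX"] = "/aind/data_transfer_service"

    # -> "/aind/data_transfer_service/v2/my_job_type/tasks/my_task"
    print(JobParamInfo.get_parameter_name("my_job_type", "my_task", version="v2"))

    # Query strings from the UI are mapped onto the request model; the
    # "states" value is a Python-literal list parsed with ast.literal_eval.
    params = AirflowDagRunsRequestParameters.from_query_params(
        QueryParams("states=['queued','running']&page_limit=25")
    )
    print(params.page_limit, params.states)  # 25 ['queued', 'running']

Names built by get_parameter_name are matched by the pattern from get_parameter_regex, whose named groups (job_type, task_id, and an optional modality) feed JobParamInfo.from_aws_describe_parameter.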