seekrai 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. seekrai/__init__.py +64 -0
  2. seekrai/abstract/__init__.py +1 -0
  3. seekrai/abstract/api_requestor.py +710 -0
  4. seekrai/cli/__init__.py +0 -0
  5. seekrai/cli/api/__init__.py +0 -0
  6. seekrai/cli/api/chat.py +245 -0
  7. seekrai/cli/api/completions.py +107 -0
  8. seekrai/cli/api/files.py +125 -0
  9. seekrai/cli/api/finetune.py +175 -0
  10. seekrai/cli/api/images.py +82 -0
  11. seekrai/cli/api/models.py +42 -0
  12. seekrai/cli/cli.py +77 -0
  13. seekrai/client.py +154 -0
  14. seekrai/constants.py +32 -0
  15. seekrai/error.py +188 -0
  16. seekrai/filemanager.py +393 -0
  17. seekrai/legacy/__init__.py +0 -0
  18. seekrai/legacy/base.py +27 -0
  19. seekrai/legacy/complete.py +91 -0
  20. seekrai/legacy/embeddings.py +25 -0
  21. seekrai/legacy/files.py +140 -0
  22. seekrai/legacy/finetune.py +173 -0
  23. seekrai/legacy/images.py +25 -0
  24. seekrai/legacy/models.py +44 -0
  25. seekrai/resources/__init__.py +25 -0
  26. seekrai/resources/chat/__init__.py +24 -0
  27. seekrai/resources/chat/completions.py +241 -0
  28. seekrai/resources/completions.py +205 -0
  29. seekrai/resources/embeddings.py +100 -0
  30. seekrai/resources/files.py +173 -0
  31. seekrai/resources/finetune.py +425 -0
  32. seekrai/resources/images.py +156 -0
  33. seekrai/resources/models.py +75 -0
  34. seekrai/seekrflow_response.py +50 -0
  35. seekrai/types/__init__.py +67 -0
  36. seekrai/types/abstract.py +26 -0
  37. seekrai/types/chat_completions.py +151 -0
  38. seekrai/types/common.py +64 -0
  39. seekrai/types/completions.py +86 -0
  40. seekrai/types/embeddings.py +35 -0
  41. seekrai/types/error.py +16 -0
  42. seekrai/types/files.py +88 -0
  43. seekrai/types/finetune.py +218 -0
  44. seekrai/types/images.py +42 -0
  45. seekrai/types/models.py +43 -0
  46. seekrai/utils/__init__.py +28 -0
  47. seekrai/utils/_log.py +61 -0
  48. seekrai/utils/api_helpers.py +84 -0
  49. seekrai/utils/files.py +204 -0
  50. seekrai/utils/tools.py +75 -0
  51. seekrai/version.py +6 -0
  52. seekrai-0.0.1.dist-info/LICENSE +201 -0
  53. seekrai-0.0.1.dist-info/METADATA +401 -0
  54. seekrai-0.0.1.dist-info/RECORD +56 -0
  55. seekrai-0.0.1.dist-info/WHEEL +4 -0
  56. seekrai-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,218 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+ from typing import List, Literal
5
+
6
+ from pydantic import Field
7
+
8
+ from seekrai.types.abstract import BaseModel
9
+ from seekrai.types.common import (
10
+ ObjectType,
11
+ )
12
+ from datetime import datetime
13
+
14
+
15
+
16
class FinetuneJobStatus(str, Enum):
    """
    Possible fine-tune job status values, as reported by the API.
    """

    # Job accepted but not yet queued.
    STATUS_PENDING = "pending"
    # Job waiting in the scheduling queue.
    STATUS_QUEUED = "queued"
    # Job actively training.
    STATUS_RUNNING = "running"
    # STATUS_COMPRESSING = "compressing"
    # STATUS_UPLOADING = "uploading"
    # Cancellation requested but not yet finalized.
    STATUS_CANCEL_REQUESTED = "cancel_requested"
    # Job cancelled before completion.
    STATUS_CANCELLED = "cancelled"
    # Job terminated with an error.
    STATUS_FAILED = "failed"
    # Job finished successfully.
    STATUS_COMPLETED = "completed"
30
+
31
+
32
class FinetuneEventLevels(str, Enum):
    """
    Fine-tune job event status levels.
    """

    NULL = ""
    INFO = "Info"
    WARNING = "Warning"
    ERROR = "Error"
    # Lower-case variants; the LEGACY_ prefix suggests these exist for
    # backward compatibility with older event payloads — confirm against API.
    LEGACY_INFO = "info"
    LEGACY_IWARNING = "warning"
    LEGACY_IERROR = "error"
44
+
45
+
46
class FinetuneEventType(str, Enum):
    """
    Fine-tune job event types.
    """

    # Job lifecycle
    JOB_PENDING = "JOB_PENDING"
    JOB_START = "JOB_START"
    JOB_STOPPED = "JOB_STOPPED"
    # Asset download progress
    MODEL_DOWNLOADING = "MODEL_DOWNLOADING"
    MODEL_DOWNLOAD_COMPLETE = "MODEL_DOWNLOAD_COMPLETE"
    TRAINING_DATA_DOWNLOADING = "TRAINING_DATA_DOWNLOADING"
    TRAINING_DATA_DOWNLOAD_COMPLETE = "TRAINING_DATA_DOWNLOAD_COMPLETE"
    VALIDATION_DATA_DOWNLOADING = "VALIDATION_DATA_DOWNLOADING"
    VALIDATION_DATA_DOWNLOAD_COMPLETE = "VALIDATION_DATA_DOWNLOAD_COMPLETE"
    # Training progress
    WANDB_INIT = "WANDB_INIT"
    TRAINING_START = "TRAINING_START"
    CHECKPOINT_SAVE = "CHECKPOINT_SAVE"
    BILLING_LIMIT = "BILLING_LIMIT"
    EPOCH_COMPLETE = "EPOCH_COMPLETE"
    TRAINING_COMPLETE = "TRAINING_COMPLETE"
    # Model packaging / upload
    # NOTE(review): value differs from the member name ("COMPRESSING_MODEL"
    # vs MODEL_COMPRESSING) — confirm this matches the API wire format.
    MODEL_COMPRESSING = "COMPRESSING_MODEL"
    MODEL_COMPRESSION_COMPLETE = "MODEL_COMPRESSION_COMPLETE"
    MODEL_UPLOADING = "MODEL_UPLOADING"
    MODEL_UPLOAD_COMPLETE = "MODEL_UPLOAD_COMPLETE"
    # Terminal / exceptional states
    JOB_COMPLETE = "JOB_COMPLETE"
    JOB_ERROR = "JOB_ERROR"
    CANCEL_REQUESTED = "CANCEL_REQUESTED"
    JOB_RESTARTED = "JOB_RESTARTED"
    REFUND = "REFUND"
    WARNING = "WARNING"
76
+
77
+
78
class FinetuneEvent(BaseModel):
    """
    A single event in a fine-tune job's event log.
    """

    # object type discriminator (always ObjectType.FinetuneEvent)
    object: Literal[ObjectType.FinetuneEvent]
    # created-at datetime stamp (string form; format not visible here — verify)
    created_at: str | None = None
    # event log level
    level: FinetuneEventLevels | None = None
    # event message string
    message: str | None = None
    # event type
    type: FinetuneEventType | None = None
    # optional: model parameter count
    param_count: int | None = None
    # optional: dataset token count
    token_count: int | None = None
    # optional: weights & biases run url
    wandb_url: str | None = None
    # event hash
    hash: str | None = None
101
+
102
+
103
+
104
class TrainingConfig(BaseModel):
    """Hyperparameters and data references for a fine-tune job."""

    # IDs of the uploaded training files
    training_files: List[str]
    # base model identifier string
    model: str
    # number of epochs to train for
    n_epochs: int
    # training learning rate
    learning_rate: float
    # number of checkpoints to save
    n_checkpoints: int | None = None
    # training batch size
    batch_size: int | None = None
    # up to 40 character suffix for output model name
    experiment_name: str | None = None
    # # weights & biases api key
    # wandb_key: str | None = None
121
+
122
class InfrastructureConfig(BaseModel):
    """Compute resources requested for a fine-tune job."""

    # number of CPUs to allocate
    n_cpu: int
    # number of GPUs to allocate
    n_gpu: int
125
+
126
class FinetuneRequest(BaseModel):
    """
    Fine-tune request body: training hyperparameters plus compute resources.
    """

    # hyperparameters and data references for the job
    training_config: TrainingConfig
    # compute resources for the job
    infrastructure_config: InfrastructureConfig
132
+
133
+
134
+
135
+
136
+
137
class FinetuneResponse(BaseModel):
    """
    Fine-tune API response type.

    All fields except ``inference_available`` are optional; many planned
    fields are retained below as commented-out TODOs.
    """

    # job ID
    id: str | None = None
    # IDs of the training files used for the job
    training_files: List[str] | None = None
    # validation file id
    # validation_files: str | None = None TODO
    # base model name
    model: str | None = None
    # number of epochs
    # n_epochs: int | None = None
    # number of checkpoints to save
    # n_checkpoints: int | None = None # TODO
    # training batch size
    # batch_size: int | None = None
    # training learning rate
    # learning_rate: float | None = None
    # number of steps between evals
    # eval_steps: int | None = None TODO
    # is LoRA finetune boolean
    # lora: bool | None = None
    # lora_r: int | None = None
    # lora_alpha: int | None = None
    # lora_dropout: int | None = None
    # created datetime stamp
    created_at: datetime | None = None
    # updated_at: str | None = None
    # job status
    status: FinetuneJobStatus | None = None

    # list of fine-tune events
    events: List[FinetuneEvent] | None = None
    # whether the fine-tuned model is available for inference
    inference_available: bool = False
    # dataset token count
    # TODO
    # token_count: int | None = None
    # # model parameter count
    # param_count: int | None = None
    # # fine-tune job price
    # total_price: int | None = None
    # # number of epochs completed (incrementing counter)
    # epochs_completed: int | None = None
    # # place in job queue (decrementing counter)
    # queue_depth: int | None = None
    # # weights & biases project name
    # wandb_project_name: str | None = None
    # # weights & biases job url
    # wandb_url: str | None = None
    # # training file metadata
    # training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
    # training_file_size: int | None = Field(None, alias="TrainingFileSize")
192
+
193
+
194
class FinetuneList(BaseModel):
    """List response wrapping multiple fine-tune jobs."""

    # object type
    object: Literal["list"] | None = None
    # list of fine-tune job objects
    data: List[FinetuneResponse] | None = None
199
+
200
+
201
class FinetuneListEvents(BaseModel):
    """List response wrapping a fine-tune job's events."""

    # object type
    object: Literal["list"] | None = None
    # list of fine-tune events
    data: List[FinetuneEvent] | None = None
206
+
207
+
208
class FinetuneDownloadResult(BaseModel):
    """Result of downloading a fine-tuned model checkpoint to local disk."""

    # object type ("local" marks a file written to local disk)
    object: Literal["local"] | None = None
    # fine-tune job id
    id: str | None = None
    # checkpoint step number
    checkpoint_step: int | None = None
    # local path filename
    filename: str | None = None
    # size in bytes
    size: int | None = None
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Literal
4
+
5
+ from seekrai.types.abstract import BaseModel
6
+
7
+
8
class ImageRequest(BaseModel):
    """Image-generation request parameters."""

    # text prompt to generate from (a single string; the original comment
    # claimed "input or list of inputs", but the type only allows one string)
    prompt: str
    # model to query
    model: str
    # number of generation steps
    steps: int | None = 20
    # RNG seed for reproducible generations
    seed: int | None = None
    # number of results to return
    n: int | None = 1
    # pixel height
    height: int | None = 1024
    # pixel width
    width: int | None = 1024
    # negative prompt
    negative_prompt: str | None = None
25
+
26
+
27
class ImageChoicesData(BaseModel):
    """One generated image within an ImageResponse."""

    # response index
    index: int
    # base64-encoded image payload
    b64_json: str
32
+
33
+
34
class ImageResponse(BaseModel):
    """Image-generation API response."""

    # job id
    id: str | None = None
    # model that served the request
    model: str | None = None
    # object type
    object: Literal["list"] | None = None
    # list of generated image choices
    data: List[ImageChoicesData] | None = None
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+ from typing import Literal
5
+
6
+ from seekrai.types.abstract import BaseModel
7
+ from seekrai.types.common import ObjectType
8
+
9
+
10
class ModelType(str, Enum):
    """Broad capability category of a served model."""

    CHAT = "chat"
    LANGUAGE = "language"
    CODE = "code"
    IMAGE = "image"
    EMBEDDING = "embedding"
    MODERATION = "moderation"
17
+
18
+
19
class PricingObject(BaseModel):
    """Per-model pricing entries.

    NOTE(review): the units (e.g. dollars per token / per hour) are not
    visible in this module — confirm against the API documentation.
    """

    # price per unit of input
    input: float | None = None
    # price per unit of output
    output: float | None = None
    # hourly rate
    hourly: float | None = None
    # base rate
    base: float | None = None
    # fine-tuning rate
    finetune: float | None = None
25
+
26
+
27
class ModelObject(BaseModel):
    """Metadata describing a model available through the API."""

    # model id
    id: str
    # object type discriminator (always ObjectType.Model)
    object: Literal[ObjectType.Model]
    # creation timestamp (presumably unix seconds — confirm with API docs)
    created: int | None = None
    # model type
    type: ModelType | None = None
    # pretty display name
    display_name: str | None = None
    # model creator organization
    organization: str | None = None
    # link to model resource
    link: str | None = None
    # license name
    license: str | None = None
    # maximum context length
    context_length: int | None = None
    # pricing info (required, unlike the optional fields above)
    pricing: PricingObject
@@ -0,0 +1,28 @@
1
+ from seekrai.utils._log import log_debug, log_info, log_warn, logfmt
2
+ from seekrai.utils.api_helpers import default_api_key, get_headers
3
+ from seekrai.utils.files import check_file
4
+ from seekrai.utils.tools import (
5
+ convert_bytes,
6
+ convert_unix_timestamp,
7
+ enforce_trailing_slash,
8
+ finetune_price_to_dollars,
9
+ normalize_key,
10
+ parse_timestamp,
11
+ )
12
+
13
+
14
+ __all__ = [
15
+ "check_file",
16
+ "get_headers",
17
+ "default_api_key",
18
+ "log_debug",
19
+ "log_info",
20
+ "log_warn",
21
+ "logfmt",
22
+ "enforce_trailing_slash",
23
+ "normalize_key",
24
+ "parse_timestamp",
25
+ "finetune_price_to_dollars",
26
+ "convert_bytes",
27
+ "convert_unix_timestamp",
28
+ ]
seekrai/utils/_log.py ADDED
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import re
6
+ import sys
7
+ from typing import Any, Dict
8
+
9
+ import seekrai
10
+
11
+
12
+ logger = logging.getLogger("seekrai")
13
+
14
+ SEEKRFLOW_LOG = os.environ.get("SEEKRFLOW_LOG")
15
+
16
+
17
def _console_log_level() -> str | None:
    """Return ``"debug"`` or ``"info"`` when console logging is enabled.

    The in-process ``seekrai.log`` setting takes priority over the
    ``SEEKRFLOW_LOG`` environment variable; any other value disables
    console mirroring (returns None).
    """
    for candidate in (seekrai.log, SEEKRFLOW_LOG):
        if candidate in ("debug", "info"):
            return candidate
    return None
24
+
25
+
26
def logfmt(props: Dict[str, Any]) -> str:
    """Render *props* as a logfmt-style ``key=value`` line, sorted by key.

    Bytes-like values are decoded as UTF-8; non-strings are stringified.
    Keys or values containing whitespace are rendered with ``repr`` so the
    output stays unambiguous.
    """

    def _format_pair(key: str, value: Any) -> str:
        # bytes / bytearray -> text before formatting
        if hasattr(value, "decode"):
            value = value.decode("utf-8")
        # stringify anything that is not already a string (avoids re-repr'ing)
        if not isinstance(value, str):
            value = str(value)
        if re.search(r"\s", value):
            value = repr(value)
        # keys are expected to be strings already
        if re.search(r"\s", key):
            key = repr(key)
        return f"{key}={value}"

    return " ".join(_format_pair(k, v) for k, v in sorted(props.items()))
42
+
43
+
44
def log_debug(message: str | Any, **params: Any) -> None:
    """Log *message* (plus key/value params) at DEBUG level.

    Also mirrors the line to stderr when console logging is set to "debug".
    """
    entry = logfmt(dict(message=message, **params))
    if _console_log_level() == "debug":
        print(entry, file=sys.stderr)
    logger.debug(entry)
49
+
50
+
51
def log_info(message: str | Any, **params: Any) -> None:
    """Log *message* (plus key/value params) at INFO level.

    Also mirrors the line to stderr when console logging is "debug" or "info".
    """
    entry = logfmt(dict(message=message, **params))
    if _console_log_level() in ("debug", "info"):
        print(entry, file=sys.stderr)
    logger.info(entry)
56
+
57
+
58
def log_warn(message: str | Any, **params: Any) -> None:
    """Log *message* (plus key/value params) at WARNING level.

    Unlike log_debug/log_info, the line is always printed to stderr
    regardless of the console log level.
    """
    msg = logfmt(dict(message=message, **params))
    print(msg, file=sys.stderr)
    # Fix: Logger.warn is a deprecated alias; the supported API is warning().
    logger.warning(msg)
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import platform
6
+ from typing import TYPE_CHECKING, Any, Dict
7
+
8
+
9
+ if TYPE_CHECKING:
10
+ from _typeshed import SupportsKeysAndGetItem
11
+
12
+ import seekrai
13
+ from seekrai import error
14
+ from seekrai.utils._log import _console_log_level
15
+
16
+
17
def get_headers(
    method: str | None = None,
    api_key: str | None = None,
    extra: "SupportsKeysAndGetItem[str, Any] | None" = None,
) -> Dict[str, str]:
    """
    Generates request headers with API key, metadata, and supplied headers

    Args:
        method (str, optional): HTTP request type (POST, GET, etc.)
            Defaults to None. (Currently unused by this function.)
        api_key (str, optional): API key to add as an Authorization header.
            Defaults to None.
        extra (SupportsKeysAndGetItem[str, Any], optional): Additional headers to add to request.
            Defaults to None.

    Returns:
        headers (Dict[str, str]): Compiled headers from data
    """

    user_agent = f"SeekrFlow/v1 PythonBindings/{seekrai.version}"

    # platform.uname() minus the hostname, for telemetry without leaking it
    uname_fields = platform.uname()._asdict()
    uname_without_node = " ".join(
        value for field, value in uname_fields.items() if field != "node"
    )

    client_ua = {
        "bindings_version": seekrai.version,
        "httplib": "requests",
        "lang": "python",
        "lang_version": platform.python_version(),
        "platform": platform.platform(),
        "publisher": "seekrai",
        "uname": uname_without_node,
    }

    headers: Dict[str, Any] = {
        "X-SeekrFlow-Client-User-Agent": json.dumps(client_ua),
        "Authorization": default_api_key(api_key),
        "User-Agent": user_agent,
    }

    debug_level = _console_log_level()
    if debug_level:
        headers["SeekrFlow-Debug"] = debug_level
    if extra:
        headers.update(extra)

    return headers
64
+
65
+
66
def default_api_key(api_key: str | None = None) -> str | None:
    """
    API key fallback logic from input argument and environment variable

    Args:
        api_key (str, optional): Supplied API key. This argument takes priority over env var

    Returns:
        seekrflow_api_key (str): Returns API key from supplied input or env var

    Raises:
        seekrai.error.AuthenticationError: if API key not found
    """
    if api_key:
        return api_key
    # Fix: read the environment variable once instead of two separate lookups.
    env_key = os.environ.get("SEEKRFLOW_API_KEY")
    if env_key:
        return env_key

    raise error.AuthenticationError(seekrai.constants.MISSING_API_KEY_MESSAGE)
seekrai/utils/files.py ADDED
@@ -0,0 +1,204 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+ from traceback import format_exc
7
+ from typing import Any, Dict
8
+
9
+ from pyarrow import ArrowInvalid, parquet
10
+
11
+ from seekrai.constants import (
12
+ MAX_FILE_SIZE_GB,
13
+ MIN_SAMPLES,
14
+ NUM_BYTES_IN_GB,
15
+ PARQUET_EXPECTED_COLUMNS,
16
+ )
17
+
18
+
19
def check_file(
    file: Path | str,
) -> Dict[str, Any]:
    """Validate a training-data file and return a report of the checks.

    Args:
        file: Path (or path string) to a ``.jsonl`` or ``.parquet`` file.

    Returns:
        Dict[str, Any]: report with ``is_check_passed`` plus per-check fields
        (``found``, ``file_size``, ``utf8``, ``line_type``, ``text_field``,
        ``key_value``, ``min_samples``, ``num_samples``, ``load_json``) and a
        human-readable ``message``.
    """
    if not isinstance(file, Path):
        file = Path(file)

    report_dict = {
        "is_check_passed": True,
        "message": "Checks passed",
        "found": None,
        "file_size": None,
        "utf8": None,
        "line_type": None,
        "text_field": None,
        "key_value": None,
        "min_samples": None,
        "num_samples": None,
        "load_json": None,
    }

    if not file.is_file():
        report_dict["found"] = False
        report_dict["is_check_passed"] = False
        return report_dict
    report_dict["found"] = True

    file_size = os.stat(file.as_posix()).st_size

    if file_size > MAX_FILE_SIZE_GB * NUM_BYTES_IN_GB:
        report_dict["message"] = (
            f"Maximum supported file size is {MAX_FILE_SIZE_GB} GB. Found file with size of {round(file_size / NUM_BYTES_IN_GB, 3)} GB."
        )
        # Fix: record the actual size; previously it was left as None here.
        report_dict["file_size"] = file_size
        report_dict["is_check_passed"] = False
    elif file_size == 0:
        report_dict["message"] = "File is empty"
        report_dict["file_size"] = 0
        report_dict["is_check_passed"] = False
        return report_dict
    else:
        report_dict["file_size"] = file_size

    if file.suffix == ".jsonl":
        report_dict["filetype"] = "jsonl"
        data_report_dict = _check_jsonl(file)
    elif file.suffix == ".parquet":
        report_dict["filetype"] = "parquet"
        data_report_dict = _check_parquet(file)
    else:
        report_dict["filetype"] = (
            f"Unknown extension of file {file}. "
            "Only files with extensions .jsonl and .parquet are supported."
        )
        report_dict["is_check_passed"] = False
        # Fix: return here. Previously execution fell through to
        # report_dict.update(data_report_dict) with data_report_dict unbound,
        # raising UnboundLocalError for any unsupported extension.
        return report_dict

    report_dict.update(data_report_dict)
    return report_dict
76
+
77
+
78
def _check_jsonl(file: Path) -> Dict[str, Any]:
    """Validate a ``.jsonl`` training file.

    Checks UTF-8 encoding, that every line is a JSON object with a string
    ``"text"`` field, and that the file has at least MIN_SAMPLES lines.
    Returns a partial report dict that check_file merges into its own.
    """
    report_dict: Dict[str, Any] = {}

    # Check that the file is UTF-8 encoded. If not, report the decode error.
    try:
        with file.open(encoding="utf-8") as f:
            f.read()
        report_dict["utf8"] = True
    except UnicodeDecodeError as e:
        report_dict["utf8"] = False
        report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}."
        report_dict["is_check_passed"] = False
        return report_dict

    with file.open() as f:
        # idx must be pre-set so decode errors (e.g. file is a tar) or empty
        # files are caught by the idx < 0 branch below.
        idx = -1
        try:
            for idx, line in enumerate(f):
                json_line = json.loads(line)  # each line in jsonlines should be a json

                if not isinstance(json_line, dict):
                    report_dict["line_type"] = False
                    report_dict["message"] = (
                        f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
                        'Example of valid json: {"text": "my sample string"}. '
                    )
                    report_dict["is_check_passed"] = False
                    # Fix: skip the field checks for non-dict lines. Previously
                    # execution fell through and called .keys() on the value,
                    # raising AttributeError for e.g. a JSON list.
                    continue

                if "text" not in json_line:
                    report_dict["text_field"] = False
                    report_dict["message"] = (
                        # Fix: "the the" typo in the original message.
                        f"Missing 'text' field was found on line {idx + 1} of the input file. "
                        "Expected format: {'text': 'my sample string'}. "
                    )
                    report_dict["is_check_passed"] = False
                elif not isinstance(json_line["text"], str):
                    # the value of the "text" key must be a string
                    report_dict["key_value"] = False
                    report_dict["message"] = (
                        f'Invalid value type for "text" key on line {idx + 1}. '
                        f'Expected string. Found {type(json_line["text"])}.'
                    )
                    report_dict["is_check_passed"] = False

            # Outside the per-line loop: enforce the minimum sample count.
            if idx + 1 < MIN_SAMPLES:
                report_dict["min_samples"] = False
                report_dict["message"] = (
                    f"Processing {file} resulted in only {idx + 1} samples. "
                    f"Our minimum is {MIN_SAMPLES} samples. "
                )
                report_dict["is_check_passed"] = False
            else:
                report_dict["num_samples"] = idx + 1
                report_dict["min_samples"] = True

            report_dict["load_json"] = True

        except ValueError:
            report_dict["load_json"] = False
            if idx < 0:
                report_dict["message"] = (
                    "Unable to decode file. "
                    "File may be empty or in an unsupported format. "
                )
            else:
                report_dict["message"] = (
                    f"Error parsing json payload. Unexpected format on line {idx + 1}."
                )
            report_dict["is_check_passed"] = False

    # Checks that never fired default to passing.
    if "text_field" not in report_dict:
        report_dict["text_field"] = True
    if "line_type" not in report_dict:
        report_dict["line_type"] = True
    if "key_value" not in report_dict:
        report_dict["key_value"] = True
    return report_dict
159
+
160
+
161
def _check_parquet(file: Path) -> Dict[str, Any]:
    """Validate a ``.parquet`` tokenized-dataset file.

    Checks that the file loads, contains an ``input_ids`` column, has no
    columns outside PARQUET_EXPECTED_COLUMNS, and holds at least MIN_SAMPLES
    rows. Returns a partial report dict that check_file merges into its own.
    """
    report: Dict[str, Any] = {}

    try:
        table = parquet.read_table(str(file), memory_map=True)
    except ArrowInvalid:
        report["load_parquet"] = (
            f"An exception has occurred when loading the Parquet file {file}. Please check the file for corruption. "
            f"Exception trace:\n{format_exc()}"
        )
        report["is_check_passed"] = False
        return report

    columns = table.schema.names

    # The tokenized dataset must carry the input_ids column.
    if "input_ids" not in columns:
        report["load_parquet"] = (
            f"Parquet file {file} does not contain the `input_ids` column."
        )
        report["is_check_passed"] = False
        return report

    # Reject the first column (in schema order) that is not expected.
    unexpected = [name for name in columns if name not in PARQUET_EXPECTED_COLUMNS]
    if unexpected:
        column_name = unexpected[0]
        report["load_parquet"] = (
            f"Parquet file {file} contains an unexpected column {column_name}. "
            f"Only columns {PARQUET_EXPECTED_COLUMNS} are supported."
        )
        report["is_check_passed"] = False
        return report

    num_samples = len(table)
    if num_samples < MIN_SAMPLES:
        report["min_samples"] = (
            f"Processing {file} resulted in only {num_samples} samples. "
            f"Our minimum is {MIN_SAMPLES} samples. "
        )
        report["is_check_passed"] = False
        return report
    report["num_samples"] = num_samples

    report["is_check_passed"] = True
    return report