guidellm 0.1.0__py3-none-any.whl → 0.2.0rc20250418__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of guidellm might be problematic. Click here for more details.
- guidellm/__init__.py +38 -6
- guidellm/__main__.py +294 -0
- guidellm/backend/__init__.py +19 -6
- guidellm/backend/backend.py +238 -0
- guidellm/backend/openai.py +532 -122
- guidellm/backend/response.py +132 -0
- guidellm/benchmark/__init__.py +73 -0
- guidellm/benchmark/aggregator.py +760 -0
- guidellm/benchmark/benchmark.py +838 -0
- guidellm/benchmark/benchmarker.py +334 -0
- guidellm/benchmark/entrypoints.py +141 -0
- guidellm/benchmark/output.py +946 -0
- guidellm/benchmark/profile.py +409 -0
- guidellm/benchmark/progress.py +720 -0
- guidellm/config.py +34 -56
- guidellm/data/__init__.py +4 -0
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +22 -0
- guidellm/dataset/creator.py +213 -0
- guidellm/dataset/entrypoints.py +42 -0
- guidellm/dataset/file.py +90 -0
- guidellm/dataset/hf_datasets.py +62 -0
- guidellm/dataset/in_memory.py +132 -0
- guidellm/dataset/synthetic.py +262 -0
- guidellm/objects/__init__.py +18 -0
- guidellm/objects/pydantic.py +60 -0
- guidellm/objects/statistics.py +947 -0
- guidellm/request/__init__.py +12 -10
- guidellm/request/loader.py +281 -0
- guidellm/request/request.py +79 -0
- guidellm/scheduler/__init__.py +51 -3
- guidellm/scheduler/result.py +137 -0
- guidellm/scheduler/scheduler.py +382 -0
- guidellm/scheduler/strategy.py +493 -0
- guidellm/scheduler/types.py +7 -0
- guidellm/scheduler/worker.py +511 -0
- guidellm/utils/__init__.py +16 -29
- guidellm/utils/colors.py +8 -0
- guidellm/utils/hf_transformers.py +35 -0
- guidellm/utils/random.py +43 -0
- guidellm/utils/text.py +118 -357
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/METADATA +96 -79
- guidellm-0.2.0rc20250418.dist-info/RECORD +48 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/WHEEL +1 -1
- guidellm-0.2.0rc20250418.dist-info/entry_points.txt +2 -0
- guidellm/backend/base.py +0 -320
- guidellm/core/__init__.py +0 -24
- guidellm/core/distribution.py +0 -190
- guidellm/core/report.py +0 -321
- guidellm/core/request.py +0 -44
- guidellm/core/result.py +0 -545
- guidellm/core/serializable.py +0 -169
- guidellm/executor/__init__.py +0 -10
- guidellm/executor/base.py +0 -213
- guidellm/executor/profile_generator.py +0 -343
- guidellm/main.py +0 -336
- guidellm/request/base.py +0 -194
- guidellm/request/emulated.py +0 -391
- guidellm/request/file.py +0 -76
- guidellm/request/transformers.py +0 -100
- guidellm/scheduler/base.py +0 -374
- guidellm/scheduler/load_generator.py +0 -196
- guidellm/utils/injector.py +0 -70
- guidellm/utils/progress.py +0 -196
- guidellm/utils/transformers.py +0 -151
- guidellm-0.1.0.dist-info/RECORD +0 -35
- guidellm-0.1.0.dist-info/entry_points.txt +0 -3
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info/licenses}/LICENSE +0 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/top_level.txt +0 -0
guidellm/utils/text.py
CHANGED
|
@@ -1,60 +1,76 @@
|
|
|
1
|
-
import
|
|
2
|
-
import json
|
|
1
|
+
import gzip
|
|
3
2
|
import re
|
|
3
|
+
import textwrap
|
|
4
|
+
from importlib.resources import as_file, files # type: ignore[attr-defined]
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Any,
|
|
6
|
-
from urllib.parse import urlparse
|
|
6
|
+
from typing import Any, Optional, Union
|
|
7
7
|
|
|
8
8
|
import ftfy
|
|
9
|
-
import
|
|
10
|
-
import yaml
|
|
9
|
+
import httpx
|
|
11
10
|
from loguru import logger
|
|
12
11
|
|
|
12
|
+
from guidellm import data as package_data
|
|
13
13
|
from guidellm.config import settings
|
|
14
14
|
|
|
15
15
|
__all__ = [
|
|
16
|
-
"
|
|
16
|
+
"split_text_list_by_length",
|
|
17
17
|
"filter_text",
|
|
18
|
-
"
|
|
19
|
-
"is_path_like",
|
|
20
|
-
"is_url",
|
|
21
|
-
"load_text",
|
|
22
|
-
"load_text_lines",
|
|
23
|
-
"parse_text_objects",
|
|
24
|
-
"split_lines_by_punctuation",
|
|
18
|
+
"clean_text",
|
|
25
19
|
"split_text",
|
|
20
|
+
"load_text",
|
|
21
|
+
"is_puncutation",
|
|
22
|
+
"EndlessTextCreator",
|
|
26
23
|
]
|
|
27
24
|
|
|
28
|
-
|
|
29
|
-
NAME_TITLES = [
|
|
30
|
-
"Mr.",
|
|
31
|
-
"Mrs.",
|
|
32
|
-
"Ms.",
|
|
33
|
-
"Dr.",
|
|
34
|
-
"Prof.",
|
|
35
|
-
"Jr.",
|
|
36
|
-
"Sr.",
|
|
37
|
-
"St.",
|
|
38
|
-
"Lt.",
|
|
39
|
-
"Col.",
|
|
40
|
-
"Gen.",
|
|
41
|
-
"Rep.",
|
|
42
|
-
"Sen.",
|
|
43
|
-
"Gov.",
|
|
44
|
-
"Pres.",
|
|
45
|
-
]
|
|
46
|
-
SENTENCE_REGEX = r'[^.!?]*[.!?]["\']?\s*(?=[A-Z])'
|
|
47
|
-
MAX_EXTENSION_LENGTH = 8
|
|
48
25
|
MAX_PATH_LENGTH = 4096
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def split_text_list_by_length(
    text_list: list[Any],
    max_characters: Union[int, list[int]],
    pad_horizontal: bool = True,
    pad_vertical: bool = True,
) -> list[list[str]]:
    """
    Wrap each string in ``text_list`` to a maximum line width.

    :param text_list: the list of strings to split
    :param max_characters: the maximum width for each entry; either a single
        int applied to every entry or a list with one width per entry
    :param pad_horizontal: right-justify every wrapped line to the entry's
        maximum width, defaults to True
    :param pad_vertical: pad every entry with blank lines so all entries
        have the same number of lines, defaults to True
    :return: one list of wrapped (and optionally padded) lines per input entry
    :raises ValueError: if max_characters is a list whose length differs
        from text_list
    """
    if not isinstance(max_characters, list):
        max_characters = [max_characters] * len(text_list)

    if len(max_characters) != len(text_list):
        raise ValueError(
            f"max_characters must be a list of the same length as text_list, "
            f"but got {len(max_characters)} and {len(text_list)}"
        )

    result: list[list[str]] = [
        textwrap.wrap(text, max_characters[index])
        for index, text in enumerate(text_list)
    ]

    # guard the max() below: an empty text_list previously raised ValueError
    if pad_vertical and result:
        max_lines = max(len(lines) for lines in result)
        for lines in result:
            # pad with single-space lines so every entry has max_lines rows
            lines.extend(" " for _ in range(max_lines - len(lines)))

    if pad_horizontal:
        result = [
            [line.rjust(max_chars) for line in lines]
            for lines, max_chars in zip(result, max_characters)
        ]

    return result
|
|
58
74
|
|
|
59
75
|
|
|
60
76
|
def filter_text(
|
|
@@ -95,216 +111,17 @@ def filter_text(
|
|
|
95
111
|
return text
|
|
96
112
|
|
|
97
113
|
|
|
98
|
-
def clean_text(
|
|
99
|
-
|
|
100
|
-
fix_encoding: bool = True,
|
|
101
|
-
clean_whitespace: bool = False,
|
|
102
|
-
remove_empty_lines: bool = False,
|
|
103
|
-
force_new_line_punctuation: bool = False,
|
|
104
|
-
) -> str:
|
|
105
|
-
"""
|
|
106
|
-
Clean text by fixing encoding, cleaning whitespace, removing empty lines,
|
|
107
|
-
and forcing new line punctuation
|
|
108
|
-
|
|
109
|
-
:param text: the text to clean
|
|
110
|
-
:param fix_encoding: True to fix the encoding of the text, False to leave as is
|
|
111
|
-
:param clean_whitespace: True to clean the whitespace in the text
|
|
112
|
-
(remove extra spaces, tabs, etc), False to leave as is
|
|
113
|
-
:param remove_empty_lines: True to remove empty lines from the text
|
|
114
|
-
(lines with only whitespace), False to leave as is
|
|
115
|
-
:param force_new_line_punctuation: True to force new lines at punctuation
|
|
116
|
-
(line ends in a period, exclamation point, or question mark),
|
|
117
|
-
False to leave as is
|
|
118
|
-
:return: The cleaned text
|
|
119
|
-
"""
|
|
120
|
-
|
|
121
|
-
if fix_encoding:
|
|
122
|
-
text = ftfy.fix_text(text)
|
|
123
|
-
|
|
124
|
-
if clean_whitespace:
|
|
125
|
-
text = "\n".join(
|
|
126
|
-
[re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
|
|
127
|
-
)
|
|
128
|
-
|
|
129
|
-
if remove_empty_lines:
|
|
130
|
-
text = "\n".join([line for line in text.splitlines() if line.strip()])
|
|
131
|
-
|
|
132
|
-
if force_new_line_punctuation:
|
|
133
|
-
# first remove any existing new lines
|
|
134
|
-
text = " ".join(line for line in text.splitlines() if line.strip())
|
|
135
|
-
lines = split_lines_by_punctuation(text)
|
|
136
|
-
text = "\n".join(lines)
|
|
137
|
-
|
|
138
|
-
return text
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def split_lines_by_punctuation(text: str) -> List[str]:
|
|
142
|
-
"""
|
|
143
|
-
Split text into lines based on punctuation
|
|
144
|
-
|
|
145
|
-
:param text: the text to split
|
|
146
|
-
:return: the list of lines
|
|
147
|
-
"""
|
|
148
|
-
|
|
149
|
-
lines = []
|
|
150
|
-
current_line = ""
|
|
151
|
-
skip_next = False
|
|
152
|
-
|
|
153
|
-
for index, char in enumerate(text):
|
|
154
|
-
if skip_next:
|
|
155
|
-
skip_next = False
|
|
156
|
-
continue
|
|
157
|
-
|
|
158
|
-
current_line += char
|
|
159
|
-
|
|
160
|
-
if char not in [".", "!", "?"]:
|
|
161
|
-
# must match end of sentence punctuation
|
|
162
|
-
continue
|
|
163
|
-
|
|
164
|
-
# if this is the character for a title, don't split
|
|
165
|
-
if any(current_line.endswith(title) for title in NAME_TITLES):
|
|
166
|
-
continue
|
|
167
|
-
|
|
168
|
-
char_next_1 = text[index + 1] if index + 1 < len(text) else None
|
|
169
|
-
char_next_2 = text[index + 2] if index + 2 < len(text) else None
|
|
170
|
-
char_next_3 = text[index + 3] if index + 3 < len(text) else None
|
|
171
|
-
|
|
172
|
-
next_is_space = char_next_1 and char_next_1.isspace()
|
|
173
|
-
next_is_quote_and_space = char_next_1 in ["'", '"'] and char_next_2 == " "
|
|
174
|
-
|
|
175
|
-
# next character must be a space or a quote, otherwise skip
|
|
176
|
-
if not next_is_space and not next_is_quote_and_space:
|
|
177
|
-
continue
|
|
178
|
-
|
|
179
|
-
# after this, next character must be an upper case letter
|
|
180
|
-
upper_char = char_next_3 if next_is_quote_and_space else char_next_2
|
|
181
|
-
next_is_upper = upper_char and (
|
|
182
|
-
upper_char.isupper() or upper_char in ["'", '"']
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
if not next_is_upper:
|
|
186
|
-
continue
|
|
187
|
-
|
|
188
|
-
# if next char is a quote, add it and skip next
|
|
189
|
-
if next_is_quote_and_space:
|
|
190
|
-
current_line += text[index + 1]
|
|
191
|
-
skip_next = True
|
|
192
|
-
|
|
193
|
-
lines.append(current_line.strip())
|
|
194
|
-
current_line = ""
|
|
114
|
+
def clean_text(text: str) -> str:
    """Fix text encoding issues and collapse all whitespace runs to single spaces."""
    fixed = ftfy.fix_text(text)
    collapsed = re.sub(r"\s+", " ", fixed)
    return collapsed.strip()
|
|
195
116
|
|
|
196
|
-
if current_line:
|
|
197
|
-
lines.append(current_line.strip())
|
|
198
117
|
|
|
199
|
-
|
|
118
|
+
def split_text(text: str, split_punctuation: bool = False) -> list[str]:
    """
    Normalize ``text`` and break it into tokens.

    :param text: the text to tokenize
    :param split_punctuation: when True, return words and punctuation marks
        as separate tokens; when False, split on whitespace only
    :return: the list of tokens
    """
    cleaned = clean_text(text)

    if not split_punctuation:
        return cleaned.split()

    return re.findall(r"[\w]+|[.,!?;]", cleaned)
|
|
308
125
|
|
|
309
126
|
|
|
310
127
|
def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
|
|
@@ -324,132 +141,76 @@ def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
|
|
|
324
141
|
return ""
|
|
325
142
|
|
|
326
143
|
# check URLs
|
|
327
|
-
if isinstance(data, str) and data.startswith("http"):
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
144
|
+
if isinstance(data, str) and data.strip().startswith(("http", "ftp")):
|
|
145
|
+
with httpx.Client(timeout=settings.request_timeout) as client:
|
|
146
|
+
response = client.get(data.strip())
|
|
147
|
+
response.raise_for_status()
|
|
148
|
+
return response.text
|
|
149
|
+
|
|
150
|
+
# check package data
|
|
151
|
+
if isinstance(data, str) and data.startswith("data:"):
|
|
152
|
+
resource_path = files(package_data).joinpath(data[5:])
|
|
153
|
+
with (
|
|
154
|
+
as_file(resource_path) as resource_file,
|
|
155
|
+
gzip.open(resource_file, "rt", encoding=encoding) as file,
|
|
156
|
+
):
|
|
157
|
+
return file.read()
|
|
158
|
+
|
|
159
|
+
# check gzipped files
|
|
160
|
+
if isinstance(data, str) and data.endswith(".gz"):
|
|
161
|
+
with gzip.open(data, "rt", encoding=encoding) as file:
|
|
162
|
+
return file.read()
|
|
163
|
+
|
|
164
|
+
# check if it's raw text by not being a path
|
|
165
|
+
if isinstance(data, str) and (
|
|
166
|
+
len(data) > MAX_PATH_LENGTH or not Path(data).exists()
|
|
167
|
+
):
|
|
334
168
|
return data
|
|
335
169
|
|
|
336
170
|
# assume local file
|
|
337
171
|
if not isinstance(data, Path):
|
|
338
172
|
data = Path(data)
|
|
339
173
|
|
|
340
|
-
if not data.exists():
|
|
174
|
+
if not data.exists() or not data.is_file():
|
|
341
175
|
raise FileNotFoundError(f"File not found: {data}")
|
|
342
176
|
|
|
343
|
-
if not data.is_file():
|
|
344
|
-
raise IsADirectoryError(f"Path is a directory: {data}")
|
|
345
|
-
|
|
346
177
|
return data.read_text(encoding=encoding)
|
|
347
178
|
|
|
348
179
|
|
|
349
|
-
def
|
|
350
|
-
"""
|
|
351
|
-
Parse text data into a list of dictionaries based on the format given
|
|
352
|
-
(csv, jsonl, json, yaml, txt).
|
|
353
|
-
|
|
354
|
-
:param data: the text data to parse
|
|
355
|
-
:param format_: the format of the data to parse:
|
|
356
|
-
'csv', 'jsonl', 'json', 'yaml', 'txt'
|
|
357
|
-
:return: the list of dictionaries parsed from the data, if text
|
|
358
|
-
then each line is a dictionary with a single key 'text'
|
|
180
|
+
def is_puncutation(text: str) -> bool:
    """
    Check if the text is a punctuation

    :param text: the text to check
    :type text: str
    :return: True if the text is a punctuation, False otherwise
    :rtype: bool
    """
    # only a single character can qualify as punctuation
    if len(text) != 1:
        return False
    # punctuation here means: neither alphanumeric nor whitespace
    return not (text.isalnum() or text.isspace())
|
|
429
190
|
|
|
430
|
-
# load the data if it's a path or URL
|
|
431
|
-
if isinstance(data, Path) or (isinstance(data, str) and data.startswith("http")):
|
|
432
|
-
data = load_text(data, encoding=encoding)
|
|
433
|
-
data = clean_text(data)
|
|
434
191
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
192
|
+
class EndlessTextCreator:
    """
    Cycle endlessly over the tokens of a text source to generate
    deterministic pseudo-text of any requested length.
    """

    def __init__(
        self,
        data: Union[str, Path],
        filter_start: Optional[Union[str, int]] = None,
        filter_end: Optional[Union[str, int]] = None,
    ):
        """
        :param data: the text source to load (raw text, URL, or file path)
        :param filter_start: optional marker/index where usable text begins
        :param filter_end: optional marker/index where usable text ends
        """
        self.data = data
        self.text = load_text(data)
        self.filtered_text = filter_text(self.text, filter_start, filter_end)
        self.words = split_text(self.filtered_text, split_punctuation=True)

    def create_text(self, start: int, length: int) -> str:
        """
        Build a string of ``length`` tokens beginning at ``start``,
        wrapping around the token list as needed.

        :param start: index of the first token to emit
        :param length: number of tokens to emit
        :return: the generated text
        """
        pieces: list[str] = []
        total_words = len(self.words)

        for offset in range(length):
            word = self.words[(start + offset) % total_words]

            # join tokens with spaces, but attach punctuation directly
            # to the preceding token
            if offset and not is_puncutation(word):
                pieces.append(" ")

            pieces.append(word)

        return "".join(pieces)
|