guidellm 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of guidellm might be problematic. Click here for more details.

Files changed (69) hide show
  1. guidellm/__init__.py +38 -6
  2. guidellm/__main__.py +294 -0
  3. guidellm/backend/__init__.py +19 -6
  4. guidellm/backend/backend.py +238 -0
  5. guidellm/backend/openai.py +532 -122
  6. guidellm/backend/response.py +132 -0
  7. guidellm/benchmark/__init__.py +73 -0
  8. guidellm/benchmark/aggregator.py +760 -0
  9. guidellm/benchmark/benchmark.py +838 -0
  10. guidellm/benchmark/benchmarker.py +334 -0
  11. guidellm/benchmark/entrypoints.py +141 -0
  12. guidellm/benchmark/output.py +946 -0
  13. guidellm/benchmark/profile.py +409 -0
  14. guidellm/benchmark/progress.py +720 -0
  15. guidellm/config.py +34 -56
  16. guidellm/data/__init__.py +4 -0
  17. guidellm/data/prideandprejudice.txt.gz +0 -0
  18. guidellm/dataset/__init__.py +22 -0
  19. guidellm/dataset/creator.py +213 -0
  20. guidellm/dataset/entrypoints.py +42 -0
  21. guidellm/dataset/file.py +90 -0
  22. guidellm/dataset/hf_datasets.py +62 -0
  23. guidellm/dataset/in_memory.py +132 -0
  24. guidellm/dataset/synthetic.py +262 -0
  25. guidellm/objects/__init__.py +18 -0
  26. guidellm/objects/pydantic.py +60 -0
  27. guidellm/objects/statistics.py +947 -0
  28. guidellm/request/__init__.py +12 -10
  29. guidellm/request/loader.py +281 -0
  30. guidellm/request/request.py +79 -0
  31. guidellm/scheduler/__init__.py +51 -3
  32. guidellm/scheduler/result.py +137 -0
  33. guidellm/scheduler/scheduler.py +382 -0
  34. guidellm/scheduler/strategy.py +493 -0
  35. guidellm/scheduler/types.py +7 -0
  36. guidellm/scheduler/worker.py +511 -0
  37. guidellm/utils/__init__.py +16 -29
  38. guidellm/utils/colors.py +8 -0
  39. guidellm/utils/hf_transformers.py +35 -0
  40. guidellm/utils/random.py +43 -0
  41. guidellm/utils/text.py +118 -357
  42. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/METADATA +96 -79
  43. guidellm-0.2.0.dist-info/RECORD +48 -0
  44. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/WHEEL +1 -1
  45. guidellm-0.2.0.dist-info/entry_points.txt +2 -0
  46. guidellm/backend/base.py +0 -320
  47. guidellm/core/__init__.py +0 -24
  48. guidellm/core/distribution.py +0 -190
  49. guidellm/core/report.py +0 -321
  50. guidellm/core/request.py +0 -44
  51. guidellm/core/result.py +0 -545
  52. guidellm/core/serializable.py +0 -169
  53. guidellm/executor/__init__.py +0 -10
  54. guidellm/executor/base.py +0 -213
  55. guidellm/executor/profile_generator.py +0 -343
  56. guidellm/main.py +0 -336
  57. guidellm/request/base.py +0 -194
  58. guidellm/request/emulated.py +0 -391
  59. guidellm/request/file.py +0 -76
  60. guidellm/request/transformers.py +0 -100
  61. guidellm/scheduler/base.py +0 -374
  62. guidellm/scheduler/load_generator.py +0 -196
  63. guidellm/utils/injector.py +0 -70
  64. guidellm/utils/progress.py +0 -196
  65. guidellm/utils/transformers.py +0 -151
  66. guidellm-0.1.0.dist-info/RECORD +0 -35
  67. guidellm-0.1.0.dist-info/entry_points.txt +0 -3
  68. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info/licenses}/LICENSE +0 -0
  69. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dist-info}/top_level.txt +0 -0
guidellm/utils/text.py CHANGED
@@ -1,60 +1,76 @@
1
- import csv
2
- import json
1
+ import gzip
3
2
  import re
3
+ import textwrap
4
+ from importlib.resources import as_file, files # type: ignore[attr-defined]
4
5
  from pathlib import Path
5
- from typing import Any, Dict, List, Optional, Tuple, Union
6
- from urllib.parse import urlparse
6
+ from typing import Any, Optional, Union
7
7
 
8
8
  import ftfy
9
- import requests
10
- import yaml
9
+ import httpx
11
10
  from loguru import logger
12
11
 
12
+ from guidellm import data as package_data
13
13
  from guidellm.config import settings
14
14
 
15
15
# Public API of this module. NOTE: "is_puncutation" is intentionally kept
# with its existing (misspelled) name — it is exported here and renaming it
# would break external callers.
__all__ = [
    "split_text_list_by_length",
    "filter_text",
    "clean_text",
    "split_text",
    "load_text",
    "is_puncutation",
    "EndlessTextCreator",
]
27
24
 
28
-
29
- NAME_TITLES = [
30
- "Mr.",
31
- "Mrs.",
32
- "Ms.",
33
- "Dr.",
34
- "Prof.",
35
- "Jr.",
36
- "Sr.",
37
- "St.",
38
- "Lt.",
39
- "Col.",
40
- "Gen.",
41
- "Rep.",
42
- "Sen.",
43
- "Gov.",
44
- "Pres.",
45
- ]
46
- SENTENCE_REGEX = r'[^.!?]*[.!?]["\']?\s*(?=[A-Z])'
47
- MAX_EXTENSION_LENGTH = 8
48
25
  MAX_PATH_LENGTH = 4096
49
- EXTENSION_TYPES = {
50
- "csv": "csv",
51
- "jsonl": "jsonl",
52
- "json": "json",
53
- "yaml": "yaml",
54
- "yml": "yaml",
55
- "txt": "txt",
56
- "text": "txt",
57
- }
26
+
27
+
28
def split_text_list_by_length(
    text_list: list[Any],
    max_characters: Union[int, list[int]],
    pad_horizontal: bool = True,
    pad_vertical: bool = True,
) -> list[list[str]]:
    """
    Wrap each entry of text_list into lines no longer than its maximum width,
    optionally padding the results so every entry has the same number of lines
    (vertical) and every line is right-justified to the entry's width
    (horizontal).

    :param text_list: the list of strings to split
    :param max_characters: the maximum width for each entry; either a single
        int applied to all entries or a per-entry list of the same length
        as text_list
    :param pad_horizontal: whether to right-justify each line to the entry's
        maximum width, defaults to True
    :param pad_vertical: whether to append blank lines so all entries have
        the same line count, defaults to True
    :raises ValueError: if max_characters is a list with a different length
        than text_list
    :return: a list of wrapped (and optionally padded) lines per entry
    """
    if not isinstance(max_characters, list):
        max_characters = [max_characters] * len(text_list)

    if len(max_characters) != len(text_list):
        raise ValueError(
            f"max_characters must be a list of the same length as text_list, "
            f"but got {len(max_characters)} and {len(text_list)}"
        )

    # Fix: an empty text_list previously raised
    # "ValueError: max() arg is an empty sequence" when pad_vertical was True.
    if not text_list:
        return []

    result: list[list[str]] = [
        textwrap.wrap(text, width)
        for text, width in zip(text_list, max_characters)
    ]

    if pad_vertical:
        # Bring every entry up to the tallest entry's line count.
        max_lines = max(len(lines) for lines in result)
        for lines in result:
            lines.extend(" " for _ in range(max_lines - len(lines)))

    if pad_horizontal:
        # Right-justify each line to its entry's configured width.
        result = [
            [line.rjust(width) for line in lines]
            for lines, width in zip(result, max_characters)
        ]

    return result
58
74
 
59
75
 
60
76
  def filter_text(
@@ -95,216 +111,17 @@ def filter_text(
95
111
  return text
96
112
 
97
113
 
98
- def clean_text(
99
- text: str,
100
- fix_encoding: bool = True,
101
- clean_whitespace: bool = False,
102
- remove_empty_lines: bool = False,
103
- force_new_line_punctuation: bool = False,
104
- ) -> str:
105
- """
106
- Clean text by fixing encoding, cleaning whitespace, removing empty lines,
107
- and forcing new line punctuation
108
-
109
- :param text: the text to clean
110
- :param fix_encoding: True to fix the encoding of the text, False to leave as is
111
- :param clean_whitespace: True to clean the whitespace in the text
112
- (remove extra spaces, tabs, etc), False to leave as is
113
- :param remove_empty_lines: True to remove empty lines from the text
114
- (lines with only whitespace), False to leave as is
115
- :param force_new_line_punctuation: True to force new lines at punctuation
116
- (line ends in a period, exclamation point, or question mark),
117
- False to leave as is
118
- :return: The cleaned text
119
- """
120
-
121
- if fix_encoding:
122
- text = ftfy.fix_text(text)
123
-
124
- if clean_whitespace:
125
- text = "\n".join(
126
- [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
127
- )
128
-
129
- if remove_empty_lines:
130
- text = "\n".join([line for line in text.splitlines() if line.strip()])
131
-
132
- if force_new_line_punctuation:
133
- # first remove any existing new lines
134
- text = " ".join(line for line in text.splitlines() if line.strip())
135
- lines = split_lines_by_punctuation(text)
136
- text = "\n".join(lines)
137
-
138
- return text
139
-
140
-
141
- def split_lines_by_punctuation(text: str) -> List[str]:
142
- """
143
- Split text into lines based on punctuation
144
-
145
- :param text: the text to split
146
- :return: the list of lines
147
- """
148
-
149
- lines = []
150
- current_line = ""
151
- skip_next = False
152
-
153
- for index, char in enumerate(text):
154
- if skip_next:
155
- skip_next = False
156
- continue
157
-
158
- current_line += char
159
-
160
- if char not in [".", "!", "?"]:
161
- # must match end of sentence punctuation
162
- continue
163
-
164
- # if this is the character for a title, don't split
165
- if any(current_line.endswith(title) for title in NAME_TITLES):
166
- continue
167
-
168
- char_next_1 = text[index + 1] if index + 1 < len(text) else None
169
- char_next_2 = text[index + 2] if index + 2 < len(text) else None
170
- char_next_3 = text[index + 3] if index + 3 < len(text) else None
171
-
172
- next_is_space = char_next_1 and char_next_1.isspace()
173
- next_is_quote_and_space = char_next_1 in ["'", '"'] and char_next_2 == " "
174
-
175
- # next character must be a space or a quote, otherwise skip
176
- if not next_is_space and not next_is_quote_and_space:
177
- continue
178
-
179
- # after this, next character must be an upper case letter
180
- upper_char = char_next_3 if next_is_quote_and_space else char_next_2
181
- next_is_upper = upper_char and (
182
- upper_char.isupper() or upper_char in ["'", '"']
183
- )
184
-
185
- if not next_is_upper:
186
- continue
187
-
188
- # if next char is a quote, add it and skip next
189
- if next_is_quote_and_space:
190
- current_line += text[index + 1]
191
- skip_next = True
192
-
193
- lines.append(current_line.strip())
194
- current_line = ""
114
def clean_text(text: str) -> str:
    """
    Normalize text: repair encoding artifacts via ftfy, collapse every run of
    whitespace to a single space, and strip leading/trailing whitespace.

    :param text: the text to clean
    :return: the normalized text
    """
    fixed = ftfy.fix_text(text)
    collapsed = re.sub(r"\s+", " ", fixed)
    return collapsed.strip()
195
116
 
196
- if current_line:
197
- lines.append(current_line.strip())
198
117
 
199
- return lines
118
def split_text(text: str, split_punctuation: bool = False) -> list[str]:
    """
    Tokenize cleaned text into words, optionally treating punctuation
    characters as separate tokens.

    :param text: the text to split
    :param split_punctuation: True to emit ., , ! ? ; as standalone tokens,
        False to split on whitespace only
    :return: the list of tokens
    """
    normalized = clean_text(text)

    if not split_punctuation:
        return normalized.split()

    return re.findall(r"[\w]+|[.,!?;]", normalized)
308
125
 
309
126
 
310
127
  def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
@@ -324,132 +141,76 @@ def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
324
141
  return ""
325
142
 
326
143
  # check URLs
327
- if isinstance(data, str) and data.startswith("http"):
328
- response = requests.get(data, timeout=settings.request_timeout)
329
- response.raise_for_status()
330
- return response.text
331
-
332
- # check raw text
333
- if isinstance(data, str) and not is_path_like(data, enforce_file=True):
144
+ if isinstance(data, str) and data.strip().startswith(("http", "ftp")):
145
+ with httpx.Client(timeout=settings.request_timeout) as client:
146
+ response = client.get(data.strip())
147
+ response.raise_for_status()
148
+ return response.text
149
+
150
+ # check package data
151
+ if isinstance(data, str) and data.startswith("data:"):
152
+ resource_path = files(package_data).joinpath(data[5:])
153
+ with (
154
+ as_file(resource_path) as resource_file,
155
+ gzip.open(resource_file, "rt", encoding=encoding) as file,
156
+ ):
157
+ return file.read()
158
+
159
+ # check gzipped files
160
+ if isinstance(data, str) and data.endswith(".gz"):
161
+ with gzip.open(data, "rt", encoding=encoding) as file:
162
+ return file.read()
163
+
164
+ # check if it's raw text by not being a path
165
+ if isinstance(data, str) and (
166
+ len(data) > MAX_PATH_LENGTH or not Path(data).exists()
167
+ ):
334
168
  return data
335
169
 
336
170
  # assume local file
337
171
  if not isinstance(data, Path):
338
172
  data = Path(data)
339
173
 
340
- if not data.exists():
174
+ if not data.exists() or not data.is_file():
341
175
  raise FileNotFoundError(f"File not found: {data}")
342
176
 
343
- if not data.is_file():
344
- raise IsADirectoryError(f"Path is a directory: {data}")
345
-
346
177
  return data.read_text(encoding=encoding)
347
178
 
348
179
 
349
- def parse_text_objects(data: str, format_: str = "txt") -> List[Dict]:
350
- """
351
- Parse text data into a list of dictionaries based on the format given
352
- (csv, jsonl, json, yaml, txt).
353
-
354
- :param data: the text data to parse
355
- :param format_: the format of the data to parse:
356
- 'csv', 'jsonl', 'json', 'yaml', 'txt'
357
- :return: the list of dictionaries parsed from the data, if text
358
- then each line is a dictionary with a single key 'text'
180
def is_puncutation(text: str) -> bool:
    """
    Check whether the text is a single punctuation character.

    NOTE: the misspelled name ("puncutation") is part of the module's public
    API (listed in __all__) and is preserved for backward compatibility.

    :param text: the text to check
    :type text: str
    :return: True if text is exactly one non-alphanumeric, non-whitespace
        character, False otherwise
    :rtype: bool
    """
    if len(text) != 1:
        return False

    return not (text.isalnum() or text.isspace())
429
190
 
430
- # load the data if it's a path or URL
431
- if isinstance(data, Path) or (isinstance(data, str) and data.startswith("http")):
432
- data = load_text(data, encoding=encoding)
433
- data = clean_text(data)
434
191
 
435
- # parse the data into a list of dictionaries based on the format
436
- if isinstance(data, str):
437
- data = parse_text_objects(data, format_)
192
class EndlessTextCreator:
    """
    Produces text of arbitrary length by cycling through the words of a
    source document (loaded from a path, URL, or raw string), optionally
    restricted to a filtered window of the source.
    """

    def __init__(
        self,
        data: Union[str, Path],
        filter_start: Optional[Union[str, int]] = None,
        filter_end: Optional[Union[str, int]] = None,
    ):
        # Load the raw source, trim it to the requested window, and tokenize
        # it with punctuation kept as standalone tokens.
        self.data = data
        self.text = load_text(data)
        self.filtered_text = filter_text(self.text, filter_start, filter_end)
        self.words = split_text(self.filtered_text, split_punctuation=True)

    def create_text(self, start: int, length: int) -> str:
        """
        Build a string of `length` tokens beginning at index `start`,
        wrapping around the word list as needed. Word tokens are joined with
        a single space; punctuation tokens attach directly to the preceding
        token.

        :param start: the index of the first token to emit
        :param length: the number of tokens to emit
        :return: the constructed text
        """
        pieces: list[str] = []
        total = len(self.words)

        for offset in range(length):
            token = self.words[(start + offset) % total]

            # No separator before the first token or before punctuation.
            if pieces and not is_puncutation(token):
                pieces.append(" ")

            pieces.append(token)

        return "".join(pieces)