guidellm 0.1.0__py3-none-any.whl → 0.2.0rc20250418__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of guidellm might be problematic. Click here for more details.
- guidellm/__init__.py +38 -6
- guidellm/__main__.py +294 -0
- guidellm/backend/__init__.py +19 -6
- guidellm/backend/backend.py +238 -0
- guidellm/backend/openai.py +532 -122
- guidellm/backend/response.py +132 -0
- guidellm/benchmark/__init__.py +73 -0
- guidellm/benchmark/aggregator.py +760 -0
- guidellm/benchmark/benchmark.py +838 -0
- guidellm/benchmark/benchmarker.py +334 -0
- guidellm/benchmark/entrypoints.py +141 -0
- guidellm/benchmark/output.py +946 -0
- guidellm/benchmark/profile.py +409 -0
- guidellm/benchmark/progress.py +720 -0
- guidellm/config.py +34 -56
- guidellm/data/__init__.py +4 -0
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +22 -0
- guidellm/dataset/creator.py +213 -0
- guidellm/dataset/entrypoints.py +42 -0
- guidellm/dataset/file.py +90 -0
- guidellm/dataset/hf_datasets.py +62 -0
- guidellm/dataset/in_memory.py +132 -0
- guidellm/dataset/synthetic.py +262 -0
- guidellm/objects/__init__.py +18 -0
- guidellm/objects/pydantic.py +60 -0
- guidellm/objects/statistics.py +947 -0
- guidellm/request/__init__.py +12 -10
- guidellm/request/loader.py +281 -0
- guidellm/request/request.py +79 -0
- guidellm/scheduler/__init__.py +51 -3
- guidellm/scheduler/result.py +137 -0
- guidellm/scheduler/scheduler.py +382 -0
- guidellm/scheduler/strategy.py +493 -0
- guidellm/scheduler/types.py +7 -0
- guidellm/scheduler/worker.py +511 -0
- guidellm/utils/__init__.py +16 -29
- guidellm/utils/colors.py +8 -0
- guidellm/utils/hf_transformers.py +35 -0
- guidellm/utils/random.py +43 -0
- guidellm/utils/text.py +118 -357
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/METADATA +96 -79
- guidellm-0.2.0rc20250418.dist-info/RECORD +48 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/WHEEL +1 -1
- guidellm-0.2.0rc20250418.dist-info/entry_points.txt +2 -0
- guidellm/backend/base.py +0 -320
- guidellm/core/__init__.py +0 -24
- guidellm/core/distribution.py +0 -190
- guidellm/core/report.py +0 -321
- guidellm/core/request.py +0 -44
- guidellm/core/result.py +0 -545
- guidellm/core/serializable.py +0 -169
- guidellm/executor/__init__.py +0 -10
- guidellm/executor/base.py +0 -213
- guidellm/executor/profile_generator.py +0 -343
- guidellm/main.py +0 -336
- guidellm/request/base.py +0 -194
- guidellm/request/emulated.py +0 -391
- guidellm/request/file.py +0 -76
- guidellm/request/transformers.py +0 -100
- guidellm/scheduler/base.py +0 -374
- guidellm/scheduler/load_generator.py +0 -196
- guidellm/utils/injector.py +0 -70
- guidellm/utils/progress.py +0 -196
- guidellm/utils/transformers.py +0 -151
- guidellm-0.1.0.dist-info/RECORD +0 -35
- guidellm-0.1.0.dist-info/entry_points.txt +0 -3
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info/licenses}/LICENSE +0 -0
- {guidellm-0.1.0.dist-info → guidellm-0.2.0rc20250418.dist-info}/top_level.txt +0 -0
guidellm/utils/text.py
CHANGED
|
@@ -1,60 +1,76 @@
|
|
|
1
|
-
import
|
|
2
|
-
import json
|
|
1
|
+
import gzip
|
|
3
2
|
import re
|
|
3
|
+
import textwrap
|
|
4
|
+
from importlib.resources import as_file, files # type: ignore[attr-defined]
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Any,
|
|
6
|
-
from urllib.parse import urlparse
|
|
6
|
+
from typing import Any, Optional, Union
|
|
7
7
|
|
|
8
8
|
import ftfy
|
|
9
|
-
import
|
|
10
|
-
import yaml
|
|
9
|
+
import httpx
|
|
11
10
|
from loguru import logger
|
|
12
11
|
|
|
12
|
+
from guidellm import data as package_data
|
|
13
13
|
from guidellm.config import settings
|
|
14
14
|
|
|
15
15
|
__all__ = [
|
|
16
|
-
"
|
|
16
|
+
"split_text_list_by_length",
|
|
17
17
|
"filter_text",
|
|
18
|
-
"
|
|
19
|
-
"is_path_like",
|
|
20
|
-
"is_url",
|
|
21
|
-
"load_text",
|
|
22
|
-
"load_text_lines",
|
|
23
|
-
"parse_text_objects",
|
|
24
|
-
"split_lines_by_punctuation",
|
|
18
|
+
"clean_text",
|
|
25
19
|
"split_text",
|
|
20
|
+
"load_text",
|
|
21
|
+
"is_puncutation",
|
|
22
|
+
"EndlessTextCreator",
|
|
26
23
|
]
|
|
27
24
|
|
|
28
|
-
|
|
29
|
-
NAME_TITLES = [
|
|
30
|
-
"Mr.",
|
|
31
|
-
"Mrs.",
|
|
32
|
-
"Ms.",
|
|
33
|
-
"Dr.",
|
|
34
|
-
"Prof.",
|
|
35
|
-
"Jr.",
|
|
36
|
-
"Sr.",
|
|
37
|
-
"St.",
|
|
38
|
-
"Lt.",
|
|
39
|
-
"Col.",
|
|
40
|
-
"Gen.",
|
|
41
|
-
"Rep.",
|
|
42
|
-
"Sen.",
|
|
43
|
-
"Gov.",
|
|
44
|
-
"Pres.",
|
|
45
|
-
]
|
|
46
|
-
SENTENCE_REGEX = r'[^.!?]*[.!?]["\']?\s*(?=[A-Z])'
|
|
47
|
-
MAX_EXTENSION_LENGTH = 8
|
|
48
25
|
MAX_PATH_LENGTH = 4096
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def split_text_list_by_length(
    text_list: list[Any],
    max_characters: Union[int, list[int]],
    pad_horizontal: bool = True,
    pad_vertical: bool = True,
) -> list[list[str]]:
    """
    Wrap each string in ``text_list`` to a maximum line width.

    :param text_list: the list of strings to split
    :param max_characters: the maximum width for each entry; either a single
        int applied to every entry or a list with one width per entry
    :param pad_horizontal: right-justify every wrapped line to the entry's
        maximum width, defaults to True
    :param pad_vertical: pad every entry with blank lines so all entries
        have the same number of lines, defaults to True
    :return: one list of wrapped (and optionally padded) lines per input entry
    :raises ValueError: if max_characters is a list whose length differs
        from text_list
    """
    if not isinstance(max_characters, list):
        max_characters = [max_characters] * len(text_list)

    if len(max_characters) != len(text_list):
        raise ValueError(
            f"max_characters must be a list of the same length as text_list, "
            f"but got {len(max_characters)} and {len(text_list)}"
        )

    result: list[list[str]] = [
        textwrap.wrap(text, max_characters[index])
        for index, text in enumerate(text_list)
    ]

    # guard the max() below: an empty text_list previously raised ValueError
    if pad_vertical and result:
        max_lines = max(len(lines) for lines in result)
        for lines in result:
            # pad with single-space lines so every entry has max_lines rows
            lines.extend(" " for _ in range(max_lines - len(lines)))

    if pad_horizontal:
        result = [
            [line.rjust(max_chars) for line in lines]
            for lines, max_chars in zip(result, max_characters)
        ]

    return result
|
|
58
74
|
|
|
59
75
|
|
|
60
76
|
def filter_text(
|
|
@@ -95,216 +111,17 @@ def filter_text(
|
|
|
95
111
|
return text
|
|
96
112
|
|
|
97
113
|
|
|
98
|
-
def clean_text(
|
|
99
|
-
|
|
100
|
-
fix_encoding: bool = True,
|
|
101
|
-
clean_whitespace: bool = False,
|
|
102
|
-
remove_empty_lines: bool = False,
|
|
103
|
-
force_new_line_punctuation: bool = False,
|
|
104
|
-
) -> str:
|
|
105
|
-
"""
|
|
106
|
-
Clean text by fixing encoding, cleaning whitespace, removing empty lines,
|
|
107
|
-
and forcing new line punctuation
|
|
108
|
-
|
|
109
|
-
:param text: the text to clean
|
|
110
|
-
:param fix_encoding: True to fix the encoding of the text, False to leave as is
|
|
111
|
-
:param clean_whitespace: True to clean the whitespace in the text
|
|
112
|
-
(remove extra spaces, tabs, etc), False to leave as is
|
|
113
|
-
:param remove_empty_lines: True to remove empty lines from the text
|
|
114
|
-
(lines with only whitespace), False to leave as is
|
|
115
|
-
:param force_new_line_punctuation: True to force new lines at punctuation
|
|
116
|
-
(line ends in a period, exclamation point, or question mark),
|
|
117
|
-
False to leave as is
|
|
118
|
-
:return: The cleaned text
|
|
119
|
-
"""
|
|
120
|
-
|
|
121
|
-
if fix_encoding:
|
|
122
|
-
text = ftfy.fix_text(text)
|
|
123
|
-
|
|
124
|
-
if clean_whitespace:
|
|
125
|
-
text = "\n".join(
|
|
126
|
-
[re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
|
|
127
|
-
)
|
|
128
|
-
|
|
129
|
-
if remove_empty_lines:
|
|
130
|
-
text = "\n".join([line for line in text.splitlines() if line.strip()])
|
|
131
|
-
|
|
132
|
-
if force_new_line_punctuation:
|
|
133
|
-
# first remove any existing new lines
|
|
134
|
-
text = " ".join(line for line in text.splitlines() if line.strip())
|
|
135
|
-
lines = split_lines_by_punctuation(text)
|
|
136
|
-
text = "\n".join(lines)
|
|
137
|
-
|
|
138
|
-
return text
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def split_lines_by_punctuation(text: str) -> List[str]:
|
|
142
|
-
"""
|
|
143
|
-
Split text into lines based on punctuation
|
|
144
|
-
|
|
145
|
-
:param text: the text to split
|
|
146
|
-
:return: the list of lines
|
|
147
|
-
"""
|
|
148
|
-
|
|
149
|
-
lines = []
|
|
150
|
-
current_line = ""
|
|
151
|
-
skip_next = False
|
|
152
|
-
|
|
153
|
-
for index, char in enumerate(text):
|
|
154
|
-
if skip_next:
|
|
155
|
-
skip_next = False
|
|
156
|
-
continue
|
|
157
|
-
|
|
158
|
-
current_line += char
|
|
159
|
-
|
|
160
|
-
if char not in [".", "!", "?"]:
|
|
161
|
-
# must match end of sentence punctuation
|
|
162
|
-
continue
|
|
163
|
-
|
|
164
|
-
# if this is the character for a title, don't split
|
|
165
|
-
if any(current_line.endswith(title) for title in NAME_TITLES):
|
|
166
|
-
continue
|
|
167
|
-
|
|
168
|
-
char_next_1 = text[index + 1] if index + 1 < len(text) else None
|
|
169
|
-
char_next_2 = text[index + 2] if index + 2 < len(text) else None
|
|
170
|
-
char_next_3 = text[index + 3] if index + 3 < len(text) else None
|
|
171
|
-
|
|
172
|
-
next_is_space = char_next_1 and char_next_1.isspace()
|
|
173
|
-
next_is_quote_and_space = char_next_1 in ["'", '"'] and char_next_2 == " "
|
|
174
|
-
|
|
175
|
-
# next character must be a space or a quote, otherwise skip
|
|
176
|
-
if not next_is_space and not next_is_quote_and_space:
|
|
177
|
-
continue
|
|
178
|
-
|
|
179
|
-
# after this, next character must be an upper case letter
|
|
180
|
-
upper_char = char_next_3 if next_is_quote_and_space else char_next_2
|
|
181
|
-
next_is_upper = upper_char and (
|
|
182
|
-
upper_char.isupper() or upper_char in ["'", '"']
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
if not next_is_upper:
|
|
186
|
-
continue
|
|
187
|
-
|
|
188
|
-
# if next char is a quote, add it and skip next
|
|
189
|
-
if next_is_quote_and_space:
|
|
190
|
-
current_line += text[index + 1]
|
|
191
|
-
skip_next = True
|
|
192
|
-
|
|
193
|
-
lines.append(current_line.strip())
|
|
194
|
-
current_line = ""
|
|
114
|
+
def clean_text(text: str) -> str:
    """Fix text encoding issues and collapse all whitespace runs to single spaces."""
    fixed = ftfy.fix_text(text)
    collapsed = re.sub(r"\s+", " ", fixed)
    return collapsed.strip()
|
|
195
116
|
|
|
196
|
-
if current_line:
|
|
197
|
-
lines.append(current_line.strip())
|
|
198
117
|
|
|
199
|
-
|
|
118
|
+
def split_text(text: str, split_punctuation: bool = False) -> list[str]:
    """
    Normalize ``text`` and break it into tokens.

    :param text: the text to tokenize
    :param split_punctuation: when True, return words and punctuation marks
        as separate tokens; when False, split on whitespace only
    :return: the list of tokens
    """
    cleaned = clean_text(text)

    if not split_punctuation:
        return cleaned.split()

    return re.findall(r"[\w]+|[.,!?;]", cleaned)
|
|
308
125
|
|
|
309
126
|
|
|
310
127
|
def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
|
|
@@ -324,132 +141,76 @@ def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
|
|
|
324
141
|
return ""
|
|
325
142
|
|
|
326
143
|
# check URLs
|
|
327
|
-
if isinstance(data, str) and data.startswith("http"):
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
144
|
+
if isinstance(data, str) and data.strip().startswith(("http", "ftp")):
|
|
145
|
+
with httpx.Client(timeout=settings.request_timeout) as client:
|
|
146
|
+
response = client.get(data.strip())
|
|
147
|
+
response.raise_for_status()
|
|
148
|
+
return response.text
|
|
149
|
+
|
|
150
|
+
# check package data
|
|
151
|
+
if isinstance(data, str) and data.startswith("data:"):
|
|
152
|
+
resource_path = files(package_data).joinpath(data[5:])
|
|
153
|
+
with (
|
|
154
|
+
as_file(resource_path) as resource_file,
|
|
155
|
+
gzip.open(resource_file, "rt", encoding=encoding) as file,
|
|
156
|
+
):
|
|
157
|
+
return file.read()
|
|
158
|
+
|
|
159
|
+
# check gzipped files
|
|
160
|
+
if isinstance(data, str) and data.endswith(".gz"):
|
|
161
|
+
with gzip.open(data, "rt", encoding=encoding) as file:
|
|
162
|
+
return file.read()
|
|
163
|
+
|
|
164
|
+
# check if it's raw text by not being a path
|
|
165
|
+
if isinstance(data, str) and (
|
|
166
|
+
len(data) > MAX_PATH_LENGTH or not Path(data).exists()
|
|
167
|
+
):
|
|
334
168
|
return data
|
|
335
169
|
|
|
336
170
|
# assume local file
|
|
337
171
|
if not isinstance(data, Path):
|
|
338
172
|
data = Path(data)
|
|
339
173
|
|
|
340
|
-
if not data.exists():
|
|
174
|
+
if not data.exists() or not data.is_file():
|
|
341
175
|
raise FileNotFoundError(f"File not found: {data}")
|
|
342
176
|
|
|
343
|
-
if not data.is_file():
|
|
344
|
-
raise IsADirectoryError(f"Path is a directory: {data}")
|
|
345
|
-
|
|
346
177
|
return data.read_text(encoding=encoding)
|
|
347
178
|
|
|
348
179
|
|
|
349
|
-
def
|
|
350
|
-
"""
|
|
351
|
-
Parse text data into a list of dictionaries based on the format given
|
|
352
|
-
(csv, jsonl, json, yaml, txt).
|
|
353
|
-
|
|
354
|
-
:param data: the text data to parse
|
|
355
|
-
:param format_: the format of the data to parse:
|
|
356
|
-
'csv', 'jsonl', 'json', 'yaml', 'txt'
|
|
357
|
-
:return: the list of dictionaries parsed from the data, if text
|
|
358
|
-
then each line is a dictionary with a single key 'text'
|
|
180
|
+
def is_puncutation(text: str) -> bool:
    """
    Check if the text is a punctuation

    :param text: the text to check
    :type text: str
    :return: True if the text is a punctuation, False otherwise
    :rtype: bool
    """
    # only a single character can qualify as punctuation
    if len(text) != 1:
        return False
    # punctuation here means: neither alphanumeric nor whitespace
    return not (text.isalnum() or text.isspace())
|
|
429
190
|
|
|
430
|
-
# load the data if it's a path or URL
|
|
431
|
-
if isinstance(data, Path) or (isinstance(data, str) and data.startswith("http")):
|
|
432
|
-
data = load_text(data, encoding=encoding)
|
|
433
|
-
data = clean_text(data)
|
|
434
191
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
192
|
+
class EndlessTextCreator:
    """
    Cycle endlessly over the tokens of a text source to generate
    deterministic pseudo-text of any requested length.
    """

    def __init__(
        self,
        data: Union[str, Path],
        filter_start: Optional[Union[str, int]] = None,
        filter_end: Optional[Union[str, int]] = None,
    ):
        """
        :param data: the text source to load (raw text, URL, or file path)
        :param filter_start: optional marker/index where usable text begins
        :param filter_end: optional marker/index where usable text ends
        """
        self.data = data
        self.text = load_text(data)
        self.filtered_text = filter_text(self.text, filter_start, filter_end)
        self.words = split_text(self.filtered_text, split_punctuation=True)

    def create_text(self, start: int, length: int) -> str:
        """
        Build a string of ``length`` tokens beginning at ``start``,
        wrapping around the token list as needed.

        :param start: index of the first token to emit
        :param length: number of tokens to emit
        :return: the generated text
        """
        pieces: list[str] = []
        total_words = len(self.words)

        for offset in range(length):
            word = self.words[(start + offset) % total_words]

            # join tokens with spaces, but attach punctuation directly
            # to the preceding token
            if offset and not is_puncutation(word):
                pieces.append(" ")

            pieces.append(word)

        return "".join(pieces)
|