jb_async_utils 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
async_utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .async_utils import *
|
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
import heapq
|
|
5
|
+
import random
|
|
6
|
+
import struct
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
import datetime
|
|
10
|
+
from collections import deque
|
|
11
|
+
|
|
12
|
+
import aiohttp
|
|
13
|
+
|
|
14
|
+
# Module-level logger used by process_batch_data and the task runners below.
logger = logging.getLogger(__name__)
# NOTE(review): calling basicConfig at import time configures the *root* logger
# as a side effect of importing this library module — confirm this is intended;
# libraries conventionally leave handler configuration to the application.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
)
|
|
20
|
+
|
|
21
|
+
"""
|
|
22
|
+
basic
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class BasicTaskDatum:
    """One unit of work: a task id, its payload dict, and run bookkeeping.

    Tracks how many times the task has been attempted (``run_id``) and the
    wall-clock start/end timestamps of the latest run (epoch seconds,
    expected to be set by the task runner).
    """

    def __init__(self, task_id, data):
        self.task_id = task_id
        self.data = data

        # Bookkeeping filled in by the scheduler / runner.
        self.run_id = 0
        self.start_time = 0
        self.end_time = 0

    def get_log_string(self):
        """Short identifier used in scheduler log lines."""
        return "task#{} run#{}".format(self.task_id, self.run_id)

    def get_json_obj(self):
        """Build the JSON-serializable record written to the output file."""
        return {
            "task_id": self.task_id,
            "data": self.data,
            "start_time": datetime.datetime.fromtimestamp(self.start_time).isoformat(),
            "end_time": datetime.datetime.fromtimestamp(self.end_time).isoformat(),
        }

    def finish(self):
        """Post-success hook; the base class has nothing to do."""
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class BasicQuotaManager:
    """Simple rate limiter granting a fixed number of runs per minute.

    ``deduct_quota`` consumes one slot when a task starts; ``reclaim_quota``
    returns slots for runs whose end time is more than 60 seconds old.
    """

    def __init__(self):
        self.runs_per_minute = 5

    def has_enough_quota(self, init_task_datum):
        """True when at least one run slot remains."""
        return self.runs_per_minute > 0

    def reclaim_quota(self, done_task_datum_queue):
        """Pop finished runs older than 60 s from the heap, freeing one slot each."""
        while done_task_datum_queue:
            oldest_end_time = done_task_datum_queue[0][0]
            if oldest_end_time >= time.time() - 60:
                break
            heapq.heappop(done_task_datum_queue)
            self.runs_per_minute += 1

    def deduct_quota(self, init_task_datum):
        """Consume one run slot."""
        self.runs_per_minute -= 1
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def process_batch_data(
    input_file, output_file, task_datum_class, task_runner, quota_manager,
    max_task_runs=1, start_id=None, end_id=None, ignore_and_rewrite_output_file=False,
    sleep_interval=0.001,
):
    """Run tasks from a JSONL input file concurrently; append results to a JSONL output file.

    Each input line is parsed as JSON and wrapped as ``task_datum_class(task_id, data)``.
    ``task_runner(task_datum)`` is scheduled as an asyncio task whenever
    ``quota_manager`` grants quota.  Failed runs are retried until a task has
    been attempted ``max_task_runs`` times.  Task ids already present in
    ``output_file`` are skipped unless ``ignore_and_rewrite_output_file`` is true.

    Args:
        input_file: path to a JSONL file, one task datum per line.
        output_file: path to the JSONL results file (appended to by default).
        task_datum_class: class constructed as ``task_datum_class(task_id, data)``.
        task_runner: async callable taking a task datum and returning it.
        quota_manager: object with ``has_enough_quota`` / ``deduct_quota`` /
            ``reclaim_quota`` methods.
        max_task_runs: maximum attempts per task, including the first run.
        start_id: optional 1-based id of the first input line to process.
        end_id: optional 1-based id of the last input line to process (inclusive).
        ignore_and_rewrite_output_file: if true, overwrite the output file and
            ignore previously completed task ids.
        sleep_interval: seconds slept between scheduler iterations.
    """
    # Resume support: collect ids of tasks already written to the output file.
    completed_task_id_set = set()
    if not ignore_and_rewrite_output_file and os.path.exists(output_file):
        with open(output_file, "r", encoding="utf8") as f:
            for line in f:
                completed_task_id_set.add(json.loads(line)["task_id"])

    input_task_id = 0        # 1-based id of the most recently read input line
    no_more_input = False

    todo_task_datum_queue = deque()    # data waiting for quota
    running_task_to_datum = {}         # asyncio.Task -> task datum
    done_task_datum_queue = []         # min-heap of (end_time, seq, datum) for quota reclaim
    done_task_datum_queue_next_id = 0  # heap tie-breaker so data are never compared

    output_mode = "w" if ignore_and_rewrite_output_file else "a"
    # Fix: manage both files with ``with`` so they are closed even if a runner
    # or the scheduler raises (the original leaked the handles on error).
    with open(input_file, "r", encoding="utf8") as fr, \
            open(output_file, output_mode, encoding="utf8") as fw:
        while True:
            await asyncio.sleep(sleep_interval)

            # Step 1: harvest finished tasks — write successes, requeue failures.
            new_running_task_to_datum = {}
            for running_task, running_task_datum in running_task_to_datum.items():
                if not running_task.done():
                    new_running_task_to_datum[running_task] = running_task_datum
                    continue

                try:
                    _ = running_task.result()
                    successful = True
                # Fix: the original bare ``except:`` also swallowed
                # asyncio.CancelledError / KeyboardInterrupt and turned a
                # cancellation into a retry; catch only real task failures
                # and let cancellation propagate.
                except Exception:
                    successful = False

                if successful:
                    running_task_datum.finish()
                    logger.info(f"[success] {running_task_datum.get_log_string()}")
                    json.dump(running_task_datum.get_json_obj(), fw, ensure_ascii=False)
                    fw.write("\n")
                    fw.flush()
                else:
                    running_task_datum.end_time = time.time()
                    if running_task_datum.run_id < max_task_runs:
                        logger.info(f"[error] {running_task_datum.get_log_string()}")
                        todo_task_datum_queue.append(running_task_datum)
                    else:
                        logger.info(f"[error] [quit] {running_task_datum.get_log_string()}")

                # Every finished run enters the done heap so its quota can be
                # reclaimed once it ages out.
                heapq.heappush(
                    done_task_datum_queue,
                    (
                        running_task_datum.end_time,
                        done_task_datum_queue_next_id,
                        running_task_datum,
                    ),
                )
                done_task_datum_queue_next_id += 1

            running_task_to_datum = new_running_task_to_datum

            # Step 2: return quota held by sufficiently old finished runs.
            quota_manager.reclaim_quota(done_task_datum_queue)

            # Step 3: read the next task datum from the input file if needed.
            if not todo_task_datum_queue and not no_more_input:
                while True:
                    line = fr.readline()
                    if not line:
                        no_more_input = True
                        break
                    input_task_id += 1
                    if start_id is not None and input_task_id < start_id:
                        continue
                    if end_id is not None and input_task_id > end_id:
                        no_more_input = True
                        break
                    if input_task_id in completed_task_id_set:
                        continue
                    todo_task_datum_queue.append(
                        task_datum_class(input_task_id, json.loads(line))
                    )
                    break

            # Step 4: stop when nothing is queued and nothing is running.
            if not todo_task_datum_queue:
                if running_task_to_datum:
                    continue
                break

            # Step 5: launch the next task when quota allows.
            if quota_manager.has_enough_quota(todo_task_datum_queue[0]):
                init_task_datum = todo_task_datum_queue.popleft()
                init_task_datum.run_id += 1
                quota_manager.deduct_quota(init_task_datum)
                init_task = asyncio.create_task(task_runner(init_task_datum))
                running_task_to_datum[init_task] = init_task_datum
                logger.info(f"[run] {init_task_datum.get_log_string()}")

    logger.info("done")
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
"""
|
|
195
|
+
math
|
|
196
|
+
"""
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
async def math_task_runner(task_datum):
    """Demo runner: multiplies ``data["a"]`` by ``data["b"]``.

    Sleeps 10 s to simulate slow work and fails randomly about half the time
    (via the assert) so retry handling can be exercised.
    """
    task_datum.start_time = time.time()
    await asyncio.sleep(10)  # simulate a slow remote call
    product = task_datum.data["a"] * task_datum.data["b"]
    assert random.random() < 0.5  # simulate a random failure
    task_datum.end_time = time.time()

    task_datum.data["result"] = product
    return task_datum
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
"""
|
|
211
|
+
OpenAI
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class OpenAITaskDatum(BasicTaskDatum):
    """Task datum for OpenAI chat-completion requests.

    Counts input tokens at construction and collects generated texts in
    ``data["text_out_list"]``.
    """

    # Shared by all instances; must be assigned before instances are created.
    tokenizer = None
    client = None

    def __init__(self, task_id, data):
        super().__init__(task_id, data)

        self.data["in_tokens"] = len(self.tokenizer.encode(self.data["text_in"]))
        self.data["text_out_list"] = []

    def set_out_tokens(self):
        """Store the total token count of all generated outputs in data["out_tokens"]."""
        total = 0
        for text_out in self.data["text_out_list"]:
            total += len(self.tokenizer.encode(text_out))
        self.data["out_tokens"] = total
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class OpenAIQuotaManager(BasicQuotaManager):
    """Quota manager enforcing requests-per-minute and tokens-per-minute limits."""

    def __init__(self, rpm, tpm):
        super().__init__()
        self.rpm = rpm
        self.tpm = tpm

    def has_enough_quota(self, init_task_datum):
        # Requires token headroom of twice the input size — presumably to
        # reserve room for the generated output; confirm against provider limits.
        return self.rpm > 0 and self.tpm > init_task_datum.data["in_tokens"] * 2

    def reclaim_quota(self, done_task_datum_queue):
        """Return request and token quota for runs finished more than 60 s ago."""
        while done_task_datum_queue:
            finished_at, _seq, finished_datum = done_task_datum_queue[0]
            if finished_at >= time.time() - 60:
                break
            heapq.heappop(done_task_datum_queue)
            self.rpm += 1
            self.tpm += finished_datum.data["in_tokens"]

    def deduct_quota(self, init_task_datum):
        """Consume one request slot plus the task's input tokens."""
        self.rpm -= 1
        self.tpm -= init_task_datum.data["in_tokens"]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
async def openai_task_runner(task_datum):
    """Request chat completions via the client attached to the datum's class.

    Fills data["text_out_list"] with one string per returned choice and
    updates the output token count.
    """
    task_datum.start_time = time.time()
    completion = await task_datum.client.chat.completions.create(
        model=task_datum.data["model"],
        n=task_datum.data["choices"],
        messages=[
            {"role": "user", "content": task_datum.data["text_in"]},
        ],
    )
    task_datum.end_time = time.time()

    outputs = []
    for choice in completion.choices:
        outputs.append(choice.message.content)
    task_datum.data["text_out_list"] = outputs
    task_datum.set_out_tokens()

    return task_datum
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
async def dummy_openai_task_runner(task_datum):
    """Offline stand-in for openai_task_runner.

    Sleeps 10 s to mimic latency, fails randomly about half the time (via the
    assert), then fabricates one placeholder output per requested choice.
    """
    task_datum.start_time = time.time()
    await asyncio.sleep(10)  # mimic network latency
    assert random.random() < 0.5  # simulate a random request failure
    task_datum.end_time = time.time()

    task_datum.data["text_out_list"] = [
        "output-{}".format(choice_index + 1)
        for choice_index in range(task_datum.data["choices"])
    ]
    task_datum.set_out_tokens()

    return task_datum
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
"""
|
|
296
|
+
OpenAI Embedding
|
|
297
|
+
"""
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class OpenAIEmbTaskDatum(BasicTaskDatum):
    """Task datum for OpenAI embedding requests.

    Counts input tokens at construction; on success, ``finish`` serializes the
    returned vectors into the shared ``bytes_file`` as raw float64 values.
    """

    # Shared by all instances; must be assigned before use.
    tokenizer = None
    client = None
    bytes_file = None  # open binary file the embedding vectors are appended to

    def __init__(self, task_id, data):
        super().__init__(task_id, data)

        self.data["in_tokens"] = sum(
            len(self.tokenizer.encode(text))
            for text in self.data["text_list"]
        )
        self.vector_list = []

    def finish(self):
        """Append every embedding vector to bytes_file as native float64 bytes."""
        for vector in self.vector_list:
            # Fix: pack the whole vector in one struct.pack call instead of one
            # call (and one write) per element; the emitted bytes are identical.
            self.bytes_file.write(struct.pack(f"{len(vector)}d", *vector))
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
async def openai_emb_task_runner(task_datum):
    """Request embeddings for data["text_list"] via the attached client.

    Stores one embedding vector per input text on ``task_datum.vector_list``.
    """
    task_datum.start_time = time.time()
    completion = await task_datum.client.embeddings.create(
        input=task_datum.data["text_list"],
        model=task_datum.data.get("model", "text-embedding-3-small"),
        dimensions=task_datum.data.get("dimension", 256),
        encoding_format="float",
    )
    task_datum.end_time = time.time()

    task_datum.vector_list = [item.embedding for item in completion.data]
    return task_datum
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
"""
|
|
341
|
+
Deep Infra
|
|
342
|
+
"""
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class DeepInfraTaskDatum(BasicTaskDatum):
    """Task datum for DeepInfra chat-completion requests."""

    # Shared client; must be assigned before instances are created.
    client = None

    def __init__(self, task_id, data):
        super().__init__(task_id, data)

        self.data["text_out_list"] = []
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
class DeepInfraQuotaManager(BasicQuotaManager):
    """Quota manager bounding the number of concurrently running requests."""

    def __init__(self, max_concurrent_requests):
        super().__init__()
        self.requests_quota = max_concurrent_requests

    def has_enough_quota(self, init_task_datum):
        return self.requests_quota > 0

    def reclaim_quota(self, done_task_datum_queue):
        """Free one slot per finished run, regardless of age (concurrency, not rate, limit)."""
        while done_task_datum_queue:
            heapq.heappop(done_task_datum_queue)
            self.requests_quota += 1

    def deduct_quota(self, init_task_datum):
        self.requests_quota -= 1
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
async def deepinfra_task_runner(task_datum):
    """Request chat completions from DeepInfra via the attached client.

    Fills data["text_out_list"] with one string per returned choice.
    """
    task_datum.start_time = time.time()
    completion = await task_datum.client.chat.completions.create(
        model=task_datum.data["model"],
        n=task_datum.data["choices"],
        messages=[
            {"role": "user", "content": task_datum.data["text_in"]},
        ],
    )
    task_datum.end_time = time.time()

    outputs = []
    for choice in completion.choices:
        outputs.append(choice.message.content)
    task_datum.data["text_out_list"] = outputs

    return task_datum
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
"""
|
|
395
|
+
Deep Infra Embedding
|
|
396
|
+
"""
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
class DeepInfraEmbTaskDatum(BasicTaskDatum):
    """Task datum for DeepInfra embedding requests.

    On success, ``finish`` serializes the returned vectors into the shared
    ``bytes_file`` as raw float64 values.
    """

    # Shared by all instances; must be assigned before use.
    client = None
    bytes_file = None  # open binary file the embedding vectors are appended to

    def __init__(self, task_id, data):
        super().__init__(task_id, data)

        self.vector_list = []

    def finish(self):
        """Append every embedding vector to bytes_file as native float64 bytes."""
        for vector in self.vector_list:
            # Fix: pack the whole vector in one struct.pack call instead of one
            # call (and one write) per element; the emitted bytes are identical.
            self.bytes_file.write(struct.pack(f"{len(vector)}d", *vector))
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
async def deepinfra_emb_task_runner(task_datum):
    """Request embeddings for data["text_list"] from DeepInfra.

    Stores one embedding vector per input text on ``task_datum.vector_list``.
    """
    task_datum.start_time = time.time()
    completion = await task_datum.client.embeddings.create(
        input=task_datum.data["text_list"],
        model=task_datum.data["model"],
        encoding_format="float",
    )
    task_datum.end_time = time.time()

    task_datum.vector_list = [item.embedding for item in completion.data]
    return task_datum
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
"""
|
|
434
|
+
FedGPT
|
|
435
|
+
"""
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
class FedGPTTaskDatum(BasicTaskDatum):
    """Task datum for FedGPT HTTP requests."""

    # Shared credentials/endpoint; assign before instances are created.
    api_key = ""
    api_url = ""

    def __init__(self, task_id, data):
        super().__init__(task_id, data)

        self.data["text_out"] = ""
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
class FedGPTQuotaManager(BasicQuotaManager):
    """Quota manager bounding the number of concurrently running requests."""

    def __init__(self, max_concurrent_requests):
        super().__init__()
        self.requests_quota = max_concurrent_requests

    def has_enough_quota(self, init_task_datum):
        return self.requests_quota > 0

    def reclaim_quota(self, done_task_datum_queue):
        """Free one slot per finished run, regardless of age (concurrency, not rate, limit)."""
        while done_task_datum_queue:
            heapq.heappop(done_task_datum_queue)
            self.requests_quota += 1

    def deduct_quota(self, init_task_datum):
        self.requests_quota -= 1
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
async def fedgpt_task_runner(task_datum):
    """POST data["text_in"] to the FedGPT endpoint and store the first reply.

    Uses the class-level ``api_url`` / ``api_key`` attached to the datum and
    writes the response text into data["text_out"].
    """
    task_datum.start_time = time.time()
    payload = {
        "model": task_datum.data["model"],
        "mode": "normal",
        "messages": [
            {"role": "user", "content": task_datum.data["text_in"]},
        ],
    }
    headers = {"Accept": "application/json", "x-api-key": task_datum.api_key}
    # NOTE(review): ssl=False disables TLS certificate verification for this
    # request — confirm this is intentional for the target endpoint.
    async with aiohttp.ClientSession() as session:
        async with session.post(
            task_datum.api_url, headers=headers, json=payload, ssl=False
        ) as response:
            responses = await response.json()
            task_datum.end_time = time.time()

    task_datum.data["text_out"] = responses["messages"][0]["content"]
    return task_datum
|