jb_async_utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ from .async_utils import *
@@ -0,0 +1,485 @@
1
+ import os
2
+ import json
3
+ import time
4
+ import heapq
5
+ import random
6
+ import struct
7
+ import asyncio
8
+ import logging
9
+ import datetime
10
+ from collections import deque
11
+
12
+ import aiohttp
13
+
14
# Module-level logger for this package.
logger = logging.getLogger(__name__)
# NOTE(review): calling basicConfig at import time configures the root logger as a
# side effect of importing this library module -- confirm this is intentional.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
)
20
+
21
+ """
22
+ basic
23
+ """
24
+
25
+
26
class BasicTaskDatum:
    """Minimal unit of work tracked by the batch processor.

    Wraps one input record (`data`) plus bookkeeping fields: the retry
    counter `run_id` and wall-clock `start_time` / `end_time` timestamps
    (seconds since the epoch, filled in by the task runner / scheduler).
    """

    def __init__(self, task_id, data):
        self.task_id = task_id
        self.data = data
        # bookkeeping, updated by the processing loop and the task runner
        self.run_id = 0
        self.start_time = 0
        self.end_time = 0

    def get_log_string(self):
        """Short identifier used in scheduler log lines."""
        return f"task#{self.task_id} run#{self.run_id}"

    def get_json_obj(self):
        """JSON-serializable record written to the output file on success."""
        return {
            "task_id": self.task_id,
            "data": self.data,
            "start_time": datetime.datetime.fromtimestamp(self.start_time).isoformat(),
            "end_time": datetime.datetime.fromtimestamp(self.end_time).isoformat(),
        }

    def finish(self):
        """Hook invoked once on success; subclasses may flush side outputs here."""
        return
55
+
56
+
57
class BasicQuotaManager:
    """Simple rolling-window rate limiter: at most 5 task starts per minute."""

    def __init__(self):
        self.runs_per_minute = 5

    def has_enough_quota(self, init_task_datum):
        """Whether one more task may start right now."""
        return self.runs_per_minute > 0

    def reclaim_quota(self, done_task_datum_queue):
        """Restore quota for finished attempts older than 60 seconds.

        `done_task_datum_queue` is a min-heap of (end_time, seq, datum);
        entries are popped -- each restoring one quota unit -- for as long
        as the oldest one ended more than a minute ago.
        """
        while done_task_datum_queue:
            oldest_end_time = done_task_datum_queue[0][0]
            if oldest_end_time >= time.time() - 60:
                break
            heapq.heappop(done_task_datum_queue)
            self.runs_per_minute += 1

    def deduct_quota(self, init_task_datum):
        """Consume one quota unit for a task that is about to start."""
        self.runs_per_minute -= 1
77
+
78
+
79
async def process_batch_data(
    input_file, output_file, task_datum_class, task_runner, quota_manager,
    max_task_runs=1, start_id=None, end_id=None, ignore_and_rewrite_output_file=False,
    sleep_interval=0.001,
):
    """Run tasks read from a JSONL input file, writing successes to a JSONL output file.

    Each input line is parsed as JSON and wrapped in `task_datum_class(task_id, data)`,
    where `task_id` is the 1-based line number.  Tasks are started at most one per
    scheduler tick, gated by `quota_manager`, executed via the async `task_runner`,
    and retried up to `max_task_runs` total attempts on failure.  Task ids already
    present in the output file are skipped (resume support) unless
    `ignore_and_rewrite_output_file` is set.

    Args:
        input_file: path to the JSONL input file.
        output_file: path to the JSONL output file (appended to unless rewriting).
        task_datum_class: BasicTaskDatum-like class wrapping each input line.
        task_runner: async callable taking and returning a task datum.
        quota_manager: BasicQuotaManager-like object gating task starts.
        max_task_runs: maximum attempts per task before giving up.
        start_id / end_id: optional 1-based range of input line ids to process.
        ignore_and_rewrite_output_file: truncate the output instead of resuming.
        sleep_interval: seconds to sleep per scheduler tick.
    """
    # `with` (instead of bare open/close) so both files are closed even if the
    # scheduler loop raises.
    with open(input_file, "r", encoding="utf8") as fr:
        input_task_id = 0
        no_more_input = False

        # task bookkeeping
        todo_task_datum_queue = deque()
        running_task_to_datum = {}
        done_task_datum_queue = []  # min-heap of (end_time, seq, datum) for quota reclaim
        done_task_datum_queue_next_id = 0

        # resume support: collect the task ids already present in the output file
        completed_task_id_set = set()
        if not ignore_and_rewrite_output_file and os.path.exists(output_file):
            with open(output_file, "r", encoding="utf8") as f:
                for line in f:
                    datum = json.loads(line)
                    completed_task_id_set.add(datum["task_id"])

        output_mode = "w" if ignore_and_rewrite_output_file else "a"
        with open(output_file, output_mode, encoding="utf8") as fw:
            # scheduler loop
            while True:
                await asyncio.sleep(sleep_interval)

                # step 1: loop through running tasks: check and process their completion
                new_running_task_to_datum = {}
                for running_task, running_task_datum in running_task_to_datum.items():
                    if not running_task.done():
                        new_running_task_to_datum[running_task] = running_task_datum
                        continue

                    try:
                        _ = running_task.result()
                        successful = True
                    # Deliberately broad: any runner failure marks the attempt as
                    # failed and (possibly) schedules a retry.  `Exception` rather
                    # than a bare `except:` so CancelledError / KeyboardInterrupt
                    # still propagate instead of being turned into retries.
                    except Exception:
                        successful = False

                    if successful:
                        running_task_datum.finish()
                        logger.info(f"[success] {running_task_datum.get_log_string()}")
                        json.dump(running_task_datum.get_json_obj(), fw, ensure_ascii=False)
                        fw.write("\n")
                        fw.flush()
                    else:
                        running_task_datum.end_time = time.time()
                        if running_task_datum.run_id < max_task_runs:
                            logger.info(f"[error] {running_task_datum.get_log_string()}")
                            todo_task_datum_queue.append(running_task_datum)
                        else:
                            logger.info(f"[error] [quit] {running_task_datum.get_log_string()}")

                    # every finished attempt (success or failure) consumed quota,
                    # so it enters the done-heap for later reclaiming
                    heapq.heappush(
                        done_task_datum_queue,
                        (
                            running_task_datum.end_time,
                            done_task_datum_queue_next_id,
                            running_task_datum,
                        ),
                    )
                    done_task_datum_queue_next_id += 1

                running_task_to_datum = new_running_task_to_datum

                # step 2: loop through done tasks: reclaim quota
                quota_manager.reclaim_quota(done_task_datum_queue)

                # step 3: read the next runnable task datum from the input file
                if not todo_task_datum_queue and not no_more_input:
                    while True:
                        line = fr.readline()
                        if not line:
                            no_more_input = True
                            break
                        input_task_id += 1
                        if start_id is not None and input_task_id < start_id:
                            continue
                        if end_id is not None and input_task_id > end_id:
                            no_more_input = True
                            break
                        if input_task_id in completed_task_id_set:
                            continue
                        line_data = json.loads(line)
                        todo_task_datum_queue.append(task_datum_class(input_task_id, line_data))
                        break

                # step 4: end once there are no running tasks and no todo tasks
                if not todo_task_datum_queue:
                    if running_task_to_datum:
                        continue
                    break

                # step 5: run the next task if quota allows (at most one per tick)
                if quota_manager.has_enough_quota(todo_task_datum_queue[0]):
                    init_task_datum = todo_task_datum_queue.popleft()
                    init_task_datum.run_id += 1
                    quota_manager.deduct_quota(init_task_datum)
                    init_task = asyncio.create_task(task_runner(init_task_datum))
                    running_task_to_datum[init_task] = init_task_datum
                    logger.info(f"[run] {init_task_datum.get_log_string()}")

    logger.info("done")
    return
192
+
193
+
194
+ """
195
+ math
196
+ """
197
+
198
+
199
async def math_task_runner(task_datum):
    """Demo runner: multiplies data["a"] by data["b"] after a 10-second delay.

    Fails roughly half the time (via the random assert) to exercise the
    retry logic of the batch processor.
    """
    task_datum.start_time = time.time()
    await asyncio.sleep(10)
    product = task_datum.data["a"] * task_datum.data["b"]
    # simulated flakiness: ~50% of runs raise AssertionError
    assert random.random() < 0.5
    task_datum.end_time = time.time()
    task_datum.data["result"] = product
    return task_datum
208
+
209
+
210
+ """
211
+ OpenAI
212
+ """
213
+
214
+
215
class OpenAITaskDatum(BasicTaskDatum):
    """Task datum for OpenAI chat-completion calls.

    Class attributes `tokenizer` and `client` must be assigned before use
    (presumably a tokenizer with an `encode` method and an async OpenAI
    client -- set by the caller, not visible here).
    """

    tokenizer = None
    client = None

    def __init__(self, task_id, data):
        super().__init__(task_id, data)
        # pre-compute the prompt token count for token-based quota checks
        self.data["in_tokens"] = len(self.tokenizer.encode(self.data["text_in"]))
        self.data["text_out_list"] = []

    def set_out_tokens(self):
        """Record the total token count across all generated outputs."""
        out_tokens = 0
        for text_out in self.data["text_out_list"]:
            out_tokens += len(self.tokenizer.encode(text_out))
        self.data["out_tokens"] = out_tokens
232
+
233
+
234
class OpenAIQuotaManager(BasicQuotaManager):
    """Request- and token-aware limiter for OpenAI-style APIs.

    Tracks both requests-per-minute (`rpm`) and tokens-per-minute (`tpm`);
    quota spent by a request is returned about 60 seconds after it ends.
    """

    def __init__(self, rpm, tpm):
        super().__init__()
        self.rpm = rpm
        self.tpm = tpm

    def has_enough_quota(self, init_task_datum):
        # require 2x the prompt tokens as headroom for the completion
        return self.rpm > 0 and self.tpm > init_task_datum.data["in_tokens"] * 2

    def reclaim_quota(self, done_task_datum_queue):
        """Pop attempts older than 60s, restoring one request and its prompt tokens each."""
        while done_task_datum_queue:
            end_time, _seq, done_task_datum = done_task_datum_queue[0]
            if end_time >= time.time() - 60:
                break
            heapq.heappop(done_task_datum_queue)
            self.rpm += 1
            self.tpm += done_task_datum.data["in_tokens"]

    def deduct_quota(self, init_task_datum):
        self.rpm -= 1
        self.tpm -= init_task_datum.data["in_tokens"]
258
+
259
+
260
async def openai_task_runner(task_datum):
    """Call the OpenAI chat-completions API and collect all generated choices."""
    task_datum.start_time = time.time()
    completion = await task_datum.client.chat.completions.create(
        model=task_datum.data["model"],
        n=task_datum.data["choices"],
        messages=[{"role": "user", "content": task_datum.data["text_in"]}],
    )
    task_datum.end_time = time.time()

    task_datum.data["text_out_list"] = [choice.message.content for choice in completion.choices]
    task_datum.set_out_tokens()
    return task_datum
278
+
279
+
280
async def dummy_openai_task_runner(task_datum):
    """Offline stand-in for openai_task_runner: sleeps 10s and fails ~50% of runs."""
    task_datum.start_time = time.time()
    await asyncio.sleep(10)
    # simulated flakiness, mirrors math_task_runner
    assert random.random() < 0.5
    task_datum.end_time = time.time()

    outputs = []
    for i in range(task_datum.data["choices"]):
        outputs.append(f"output-{i + 1}")
    task_datum.data["text_out_list"] = outputs
    task_datum.set_out_tokens()
    return task_datum
293
+
294
+
295
+ """
296
+ OpenAI Embedding
297
+ """
298
+
299
+
300
class OpenAIEmbTaskDatum(BasicTaskDatum):
    """Task datum for OpenAI embedding calls.

    Class attributes `tokenizer`, `client`, and `bytes_file` (a binary file
    object that receives the packed vectors) must be assigned before use.
    """

    tokenizer = None
    client = None
    bytes_file = None

    def __init__(self, task_id, data):
        super().__init__(task_id, data)
        # total prompt tokens over all texts, for token-based quota checks
        token_total = 0
        for text in self.data["text_list"]:
            token_total += len(self.tokenizer.encode(text))
        self.data["in_tokens"] = token_total
        self.vector_list = []

    def finish(self):
        """On success, write every vector component to `bytes_file` as a packed C double."""
        for vector in self.vector_list:
            for component in vector:
                self.bytes_file.write(struct.pack("d", component))
321
+
322
+
323
async def openai_emb_task_runner(task_datum):
    """Call the OpenAI embeddings API and stash the resulting vectors on the datum."""
    task_datum.start_time = time.time()
    completion = await task_datum.client.embeddings.create(
        input=task_datum.data["text_list"],
        model=task_datum.data.get("model", "text-embedding-3-small"),
        dimensions=task_datum.data.get("dimension", 256),
        encoding_format="float",
    )
    task_datum.end_time = time.time()

    task_datum.vector_list = [datum.embedding for datum in completion.data]
    return task_datum
338
+
339
+
340
+ """
341
+ Deep Infra
342
+ """
343
+
344
+
345
class DeepInfraTaskDatum(BasicTaskDatum):
    """Task datum for DeepInfra chat-completion calls; `client` must be assigned first."""

    client = None

    def __init__(self, task_id, data):
        super().__init__(task_id, data)
        self.data["text_out_list"] = []
353
+
354
+
355
class DeepInfraQuotaManager(BasicQuotaManager):
    """Concurrency limiter: at most `max_concurrent_requests` requests in flight."""

    def __init__(self, max_concurrent_requests):
        super().__init__()
        self.requests_quota = max_concurrent_requests

    def has_enough_quota(self, init_task_datum):
        return self.requests_quota > 0

    def reclaim_quota(self, done_task_datum_queue):
        # every finished request frees its slot immediately (no 60-second wait)
        while done_task_datum_queue:
            heapq.heappop(done_task_datum_queue)
            self.requests_quota += 1

    def deduct_quota(self, init_task_datum):
        self.requests_quota -= 1
373
+
374
+
375
async def deepinfra_task_runner(task_datum):
    """Call the DeepInfra (OpenAI-compatible) chat API and collect all choices."""
    task_datum.start_time = time.time()
    completion = await task_datum.client.chat.completions.create(
        model=task_datum.data["model"],
        n=task_datum.data["choices"],
        messages=[{"role": "user", "content": task_datum.data["text_in"]}],
    )
    task_datum.end_time = time.time()

    task_datum.data["text_out_list"] = [choice.message.content for choice in completion.choices]
    return task_datum
392
+
393
+
394
+ """
395
+ Deep Infra Embedding
396
+ """
397
+
398
+
399
class DeepInfraEmbTaskDatum(BasicTaskDatum):
    """Task datum for DeepInfra embedding calls.

    Class attributes `client` and `bytes_file` (a binary file object that
    receives the packed vectors) must be assigned before use.
    """

    client = None
    bytes_file = None

    def __init__(self, task_id, data):
        super().__init__(task_id, data)
        self.vector_list = []

    def finish(self):
        """On success, write every vector component to `bytes_file` as a packed C double."""
        for vector in self.vector_list:
            for component in vector:
                self.bytes_file.write(struct.pack("d", component))
415
+
416
+
417
async def deepinfra_emb_task_runner(task_datum):
    """Call the DeepInfra embeddings API and stash the resulting vectors on the datum."""
    task_datum.start_time = time.time()
    completion = await task_datum.client.embeddings.create(
        input=task_datum.data["text_list"],
        model=task_datum.data["model"],
        encoding_format="float",
    )
    task_datum.end_time = time.time()

    task_datum.vector_list = [datum.embedding for datum in completion.data]
    return task_datum
431
+
432
+
433
+ """
434
+ FedGPT
435
+ """
436
+
437
+
438
class FedGPTTaskDatum(BasicTaskDatum):
    """Task datum for FedGPT HTTP calls; `api_key` and `api_url` must be assigned first."""

    api_key = ""
    api_url = ""

    def __init__(self, task_id, data):
        super().__init__(task_id, data)
        self.data["text_out"] = ""
447
+
448
+
449
class FedGPTQuotaManager(BasicQuotaManager):
    """Concurrency limiter for FedGPT: at most `max_concurrent_requests` in flight.

    Identical policy to DeepInfraQuotaManager: a finished request frees its
    slot as soon as it is seen in the done queue.
    """

    def __init__(self, max_concurrent_requests):
        super().__init__()
        self.requests_quota = max_concurrent_requests

    def has_enough_quota(self, init_task_datum):
        return self.requests_quota > 0

    def reclaim_quota(self, done_task_datum_queue):
        # every finished request frees its slot immediately (no 60-second wait)
        while done_task_datum_queue:
            heapq.heappop(done_task_datum_queue)
            self.requests_quota += 1

    def deduct_quota(self, init_task_datum):
        self.requests_quota -= 1
467
+
468
+
469
async def fedgpt_task_runner(task_datum):
    """POST one chat request to the FedGPT HTTP API and record the reply text."""
    task_datum.start_time = time.time()
    payload = {
        "model": task_datum.data["model"],
        "mode": "normal",
        "messages": [{"role": "user", "content": task_datum.data["text_in"]}],
    }
    headers = {"Accept": "application/json", "x-api-key": task_datum.api_key}
    async with aiohttp.ClientSession() as session:
        # NOTE(review): ssl=False disables certificate verification -- confirm
        # this is intentional for the target endpoint.
        async with session.post(task_datum.api_url, headers=headers, json=payload, ssl=False) as responses:
            responses = await responses.json()
    task_datum.end_time = time.time()

    task_datum.data["text_out"] = responses["messages"][0]["content"]
    return task_datum