xtremeflow-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xtremeflow/__init__.py +5 -0
- xtremeflow/kvbatch.py +59 -0
- xtremeflow/pipeline.py +39 -0
- xtremeflow/scheduler/__init__.py +5 -0
- xtremeflow/scheduler/base.py +35 -0
- xtremeflow/scheduler/rate_limit.py +111 -0
- xtremeflow/scheduler/request.py +49 -0
- xtremeflow/scheduler/token.py +79 -0
- xtremeflow-0.1.0.dist-info/METADATA +139 -0
- xtremeflow-0.1.0.dist-info/RECORD +13 -0
- xtremeflow-0.1.0.dist-info/WHEEL +5 -0
- xtremeflow-0.1.0.dist-info/licenses/LICENSE +21 -0
- xtremeflow-0.1.0.dist-info/top_level.txt +1 -0
xtremeflow/__init__.py
ADDED
xtremeflow/kvbatch.py
ADDED
@@ -0,0 +1,59 @@
'''Helper for KV cache-optimized async task batches.

This module provides utilities for executing async tasks with a "first-wait,
then-parallel" pattern optimized for KV cache utilization in LLM applications.

Execution Pattern:

    Input: [task1, task2, task3, ...]
                     ↓
    ┌────────────────────────────────────┐
    │ Phase 1: First Task                │
    │ task1 runs to completion           │
    │ (establishes KV cache)             │
    └────────────────────────────────────┘
                     ↓
    ┌────────────────────────────────────┐
    │ Phase 2: Parallel Tasks            │
    │ task2, task3, ... run concurrently │
    │ (share the established cache)      │
    └────────────────────────────────────┘
                     ↓
    Output: [result1, result2, result3, ...]

Use Case Example:

    When scoring multiple resumes for the same job, each request shares the
    job description prefix. The first request establishes a KV cache for the
    job description. Subsequent requests can then run in parallel, leveraging
    the cached computation for better performance.
'''

import asyncio
from typing import Awaitable, Iterable, List, TypeVar

T = TypeVar('T')


async def _process_aws(*aws: Awaitable[T]) -> List[T]:
    # Phase 1: run the first awaitable alone so the shared prefix gets cached.
    results = [await aws[0]] if aws else []
    # Phase 2: run the remaining awaitables concurrently.
    results += await asyncio.gather(*aws[1:])
    return results


def kv_batch(aws: Iterable[Awaitable[T]]) -> asyncio.Task[List[T]]:
    '''Create a batch task with KV cache optimization.

    Args:
        aws: An iterable of awaitables to process.

    Returns:
        An asyncio.Task that completes with a list of results.

    Example:
        >>> task = kv_batch(
        ...     llm_score(prompt) for prompt in same_job_with_different_resumes
        ... )
        >>> results = await task
    '''
    return asyncio.create_task(_process_aws(*aws))
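To make the pattern concrete, here is a minimal usage sketch (not part of the package); `score_resume` and its prompts are hypothetical stand-ins for real LLM calls:

```python
import asyncio
from xtremeflow.kvbatch import kv_batch

async def score_resume(prompt: str) -> str:
    # Hypothetical LLM call; a real one would share the job-description prefix.
    await asyncio.sleep(0.1)  # simulated network latency
    return f'scored: {prompt}'

async def main():
    prompts = [f'job description + resume {i}' for i in range(3)]
    # The first awaitable runs alone (warming the provider-side cache);
    # the remaining two then run concurrently.
    results = await kv_batch(score_resume(p) for p in prompts)
    print(results)

asyncio.run(main())
```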
xtremeflow/pipeline.py
ADDED
@@ -0,0 +1,39 @@
import asyncio
from typing import Any, AsyncGenerator, AsyncIterable, Awaitable, Callable, Optional


async def async_chunks(iterable: AsyncIterable, size: int):
    # Use the dunder protocol directly: the aiter()/anext() builtins only
    # exist on Python 3.10+, while the package declares 3.9 support.
    it = iterable.__aiter__()
    while True:
        chunk = []
        for _ in range(size):
            try:
                item = await it.__anext__()
                chunk.append(item)
            except StopAsyncIteration:
                # Flush the final, possibly short, chunk.
                if chunk:
                    yield chunk
                return
        yield chunk


async def async_pipeline(
    producer: Callable[[asyncio.Queue], Awaitable[Any]],
    process_item: Optional[Callable[[Any], Awaitable[Any]]] = None,
) -> AsyncGenerator[Any, None]:
    queue = asyncio.Queue()

    async def producer_wrapper():
        try:
            await producer(queue)
        finally:
            # Always enqueue the sentinel so the consumer loop terminates,
            # even if the producer raises.
            await queue.put(None)

    # Keep a reference so the task is not garbage-collected mid-flight.
    producer_task = asyncio.create_task(producer_wrapper())

    while True:
        item = await queue.get()
        if item is None:
            break

        try:
            yield (await process_item(item)) if process_item else item
        finally:
            queue.task_done()

    # Propagate any exception raised inside the producer.
    await producer_task
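A self-contained sketch of how `async_chunks` and `async_pipeline` compose; `numbers` and `double` are illustrative placeholders for a real source and processing step:

```python
import asyncio
from xtremeflow.pipeline import async_chunks, async_pipeline

async def numbers(queue: asyncio.Queue):
    # Producer: push items into the pipeline's queue.
    for i in range(5):
        await queue.put(i)

async def double(item: int) -> int:
    return item * 2

async def main():
    # Stream items through the pipeline, then batch the results in pairs.
    doubled = async_pipeline(numbers, double)
    async for chunk in async_chunks(doubled, size=2):
        print(chunk)  # [0, 2], then [4, 6], then [8]

asyncio.run(main())
```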
xtremeflow/scheduler/__init__.py
ADDED
xtremeflow/scheduler/base.py
ADDED
@@ -0,0 +1,35 @@
import asyncio
from typing import Any, Coroutine


class TaskScheduler:
    def __init__(self, max_concurrency: int):
        if max_concurrency <= 0:
            raise ValueError(f'max_concurrency must be positive, got {max_concurrency}')

        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.active_tasks = 0
        self.total_completed = 0
        self.pending_tasks = set()

    async def _execute_coro(self, coro: Coroutine, **kwargs) -> Any:
        # Hook point: subclasses wrap this to add rate limiting.
        return await coro

    def _task_done(self, task):
        self.pending_tasks.discard(task)
        self.active_tasks -= 1
        self.total_completed += 1
        # Free the concurrency slot only once the task has finished.
        self.semaphore.release()

    async def start_task(self, coro: Coroutine, **kwargs) -> asyncio.Task:
        await self.semaphore.acquire()
        self.active_tasks += 1
        task = asyncio.create_task(self._execute_coro(coro, **kwargs))
        self.pending_tasks.add(task)
        task.add_done_callback(self._task_done)
        return task

    async def wait_pending(self):
        if self.pending_tasks:
            await asyncio.gather(*self.pending_tasks)
            self.pending_tasks.clear()
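A quick sketch of the base scheduler's contract: at most `max_concurrency` coroutines run at once (submission itself blocks on the semaphore), and `wait_pending` drains whatever is still in flight. The `worker` coroutine is illustrative:

```python
import asyncio
from xtremeflow.scheduler.base import TaskScheduler

async def worker(i: int) -> int:
    await asyncio.sleep(0.1)
    return i

async def main():
    scheduler = TaskScheduler(max_concurrency=2)
    # Once 2 tasks are in flight, start_task blocks until a slot frees up.
    for i in range(5):
        await scheduler.start_task(worker(i))
    await scheduler.wait_pending()
    print(scheduler.total_completed)  # 5

asyncio.run(main())
```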
xtremeflow/scheduler/rate_limit.py
ADDED
@@ -0,0 +1,111 @@
from __future__ import annotations

import asyncio
import logging
import time
from abc import ABC, abstractmethod
from contextvars import ContextVar
from dataclasses import dataclass
from functools import wraps
from typing import Any, Coroutine, Optional, Type, Union

from .base import TaskScheduler

logger = logging.getLogger(__name__)

_current_ctx: ContextVar['Optional[ExecutionContext]'] = ContextVar('_current_ctx', default=None)


@dataclass
class ExecutionContext:
    scheduler: RateLimitScheduler
    extra: Optional[dict] = None


class RetryException(Exception):
    def __init__(self, message: str = '', retry_after: Optional[float] = None):
        super().__init__(message)
        self.retry_after = retry_after


def auto_backoff(
    retry_for: Union[Type[Exception], list[Type[Exception]], None] = None,
    max_retries: int = 3,
    base_retry_after: float = 2.0,
    exponential: bool = True
):
    # Normalize retry_for into something usable in an `except` clause.
    if retry_for is None:
        retry_types = (RetryException,)
    elif isinstance(retry_for, list):
        retry_types = tuple(retry_for)
    else:
        retry_types = retry_for

    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            last_exc = None
            for attempt in range(max_retries + 1):
                try:
                    return await func(*args, **kwargs)
                except retry_types as e:
                    last_exc = e

                    if attempt < max_retries:
                        # Prefer a server-provided retry hint; otherwise fall
                        # back to (exponential) backoff.
                        header_wait = getattr(e, 'retry_after', None)
                        if isinstance(header_wait, (int, float)):
                            wait_sec = float(header_wait)
                        else:
                            wait_sec = base_retry_after * (2 ** attempt) if exponential else base_retry_after
                        logger.warning(
                            f'Retrying in {wait_sec:.1f}s '
                            f'(attempt {attempt + 1}/{max_retries}): {e}'
                        )
                        ctx = _current_ctx.get()
                        if ctx:
                            # Synchronize the cool-down across all workers of
                            # the owning scheduler; retries still happen even
                            # without a scheduler context.
                            ctx.scheduler.notify_rate_limit_exceeded(wait_sec)
                        await asyncio.sleep(wait_sec)
                        continue
                    raise last_exc
        return wrapper
    return decorator


def get_context() -> Optional[ExecutionContext]:
    return _current_ctx.get()


class RateLimitScheduler(TaskScheduler, ABC):
    def __init__(self, max_concurrency: int, init_ratio: float = 0.0):
        super().__init__(max_concurrency)
        self._backoff_until = 0.0
        self._initial_ratio = init_ratio

    def notify_rate_limit_exceeded(self, retry_after: float):
        # Extend (never shorten) the shared cool-down deadline.
        self._backoff_until = max(self._backoff_until, time.monotonic() + retry_after)

    def _get_backoff_wait(self) -> float:
        return max(0.0, self._backoff_until - time.monotonic())

    def _get_wait_time(self) -> float:
        return self._get_backoff_wait()

    @abstractmethod
    def _consume_rate_quota(self):
        '''Consume rate limit quota. Subclasses must implement this.'''

    async def _wait_for_quota(self):
        while True:
            wait_time = self._get_wait_time()
            if wait_time <= 0:
                self._consume_rate_quota()
                break
            await asyncio.sleep(wait_time)

    async def _execute_coro(self, coro: Coroutine, ctx_extra=None, **kwargs) -> Any:
        # Each asyncio task runs in its own context copy, so setting the
        # ContextVar here scopes the ExecutionContext to this task only.
        ctx = ExecutionContext(scheduler=self, extra=ctx_extra)
        token = _current_ctx.set(ctx)
        try:
            await self._wait_for_quota()
            return await super()._execute_coro(coro, **kwargs)
        finally:
            _current_ctx.reset(token)
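`RateLimitScheduler` is abstract, so a concrete subclass must supply `_consume_rate_quota`. A minimal sketch, assuming a quota-free subclass that only honors the shared backoff window; `flaky_call` simulates a throttled API:

```python
import asyncio
from xtremeflow.scheduler.rate_limit import RateLimitScheduler, RetryException, auto_backoff

class BackoffOnlyScheduler(RateLimitScheduler):
    # No per-request quota; tasks only wait out the shared cool-down window.
    def _consume_rate_quota(self):
        pass

attempts = {'n': 0}

@auto_backoff(max_retries=2, base_retry_after=0.1)
async def flaky_call() -> str:
    attempts['n'] += 1
    if attempts['n'] < 3:
        # The retry_after hint is honored before the exponential fallback.
        raise RetryException('throttled', retry_after=0.1)
    return 'ok'

async def main():
    scheduler = BackoffOnlyScheduler(max_concurrency=4)
    task = await scheduler.start_task(flaky_call())
    print(await task)  # 'ok' after two synchronized backoffs

asyncio.run(main())
```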
xtremeflow/scheduler/request.py
ADDED
@@ -0,0 +1,49 @@
import time
from typing import Optional

from .rate_limit import RateLimitScheduler


class RequestRateScheduler(RateLimitScheduler):
    def __init__(
        self,
        max_rps: Optional[int] = None,
        max_rpm: Optional[int] = None,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self._max_rps = max_rps
        self._max_rpm = max_rpm
        # Token buckets, optionally pre-filled by init_ratio for an initial burst.
        self._rps_bucket = (max_rps or 0) * self._initial_ratio
        self._rpm_bucket = (max_rpm or 0) * self._initial_ratio
        self._last_req_update = time.monotonic()

    def _get_wait_time(self) -> float:
        # Refill the buckets in proportion to elapsed time, capped at the limit.
        now = time.monotonic()
        delta = now - self._last_req_update
        self._last_req_update = now

        if self._max_rps:
            self._rps_bucket = min(float(self._max_rps), self._rps_bucket + delta * self._max_rps)
        if self._max_rpm:
            self._rpm_bucket = min(float(self._max_rpm), self._rpm_bucket + delta * (self._max_rpm / 60.0))

        # The binding constraint wins: global backoff, RPS, or RPM.
        waits = [super()._get_wait_time()]
        if self._max_rps and self._rps_bucket < 1:
            waits.append((1 - self._rps_bucket) / self._max_rps)
        if self._max_rpm and self._rpm_bucket < 1:
            waits.append((1 - self._rpm_bucket) / (self._max_rpm / 60.0))

        return max(waits)

    def _consume_rate_quota(self):
        if self._max_rps:
            self._rps_bucket -= 1
        if self._max_rpm:
            self._rpm_bucket -= 1

    def reset_quota(self):
        self._rps_bucket = (self._max_rps or 0) * self._initial_ratio
        self._rpm_bucket = (self._max_rpm or 0) * self._initial_ratio
        self._last_req_update = time.monotonic()
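The bucket arithmetic works out as follows: with `max_rps=2` the bucket refills at 2 requests per second, so an empty bucket implies a wait of (1 - 0) / 2 = 0.5 s before the next request is admitted. A small sketch under that assumption; `init_ratio=1.0` pre-fills the bucket so the first two tasks start immediately:

```python
import asyncio
import time
from xtremeflow.scheduler.request import RequestRateScheduler

async def ping(i: int):
    print(f'{time.monotonic():.1f}s task {i}')

async def main():
    # 2 requests/second; after the initial burst of 2, tasks execute
    # at roughly 0.5 s intervals.
    scheduler = RequestRateScheduler(max_rps=2, max_concurrency=10, init_ratio=1.0)
    for i in range(5):
        await scheduler.start_task(ping(i))
    await scheduler.wait_pending()

asyncio.run(main())
```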
xtremeflow/scheduler/token.py
ADDED
@@ -0,0 +1,79 @@
import asyncio
import time
from typing import Optional, cast

from .request import RequestRateScheduler
from .rate_limit import get_context


class TokenRateScheduler(RequestRateScheduler):
    def __init__(
        self,
        max_tps: Optional[int] = None,
        max_tpm: Optional[int] = None,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self._max_tps = max_tps
        self._max_tpm = max_tpm
        self._tps_bucket = (self._max_tps or 0) * self._initial_ratio
        self._tpm_bucket = (self._max_tpm or 0) * self._initial_ratio
        self._last_token_update = time.monotonic()

    async def start_task(self, coro, estimated_tokens: int, **kwargs) -> asyncio.Task:
        # The estimate travels with the task via its ExecutionContext.
        return await super().start_task(
            coro, ctx_extra={'estimated_tokens': estimated_tokens},
            **kwargs)

    def _get_wait_time(self) -> float:
        now = time.monotonic()
        delta = now - self._last_token_update
        self._last_token_update = now

        if self._max_tps:
            self._tps_bucket = min(float(self._max_tps), self._tps_bucket + delta * self._max_tps)
        if self._max_tpm:
            self._tpm_bucket = min(float(self._max_tpm), self._tpm_bucket + delta * (self._max_tpm / 60.0))

        waits = [super()._get_wait_time()]

        # ctx is always set here: RateLimitScheduler._execute_coro installs
        # it before calling _wait_for_quota.
        ctx = get_context()
        tokens = ctx.extra.get('estimated_tokens', 0)
        if tokens > 0:
            if self._max_tps and self._tps_bucket < tokens:
                waits.append((tokens - self._tps_bucket) / self._max_tps)
            if self._max_tpm and self._tpm_bucket < tokens:
                waits.append((tokens - self._tpm_bucket) / (self._max_tpm / 60.0))
        return max(waits)

    def _consume_rate_quota(self):
        super()._consume_rate_quota()
        ctx = get_context()
        tokens = ctx.extra.get('estimated_tokens', 0)
        if tokens > 0:
            if self._max_tps:
                self._tps_bucket -= tokens
            if self._max_tpm:
                self._tpm_bucket -= tokens

    def _apply_correction(self, actual: int):
        # Refund (or charge) the difference between the estimate and reality.
        ctx = get_context()
        estimated = ctx.extra.get('estimated_tokens', 0)
        diff = estimated - actual
        if diff == 0:
            return
        if self._max_tps:
            self._tps_bucket = min(float(self._max_tps), self._tps_bucket + diff)
        if self._max_tpm:
            self._tpm_bucket = min(float(self._max_tpm), self._tpm_bucket + diff)

    def reset_quota(self):
        self._tps_bucket = (self._max_tps or 0) * self._initial_ratio
        self._tpm_bucket = (self._max_tpm or 0) * self._initial_ratio
        self._last_token_update = time.monotonic()


def report_token_usage(actual: int):
    # Must be called from inside a scheduled task so the context is set.
    ctx = get_context()
    cast(TokenRateScheduler, ctx.scheduler)._apply_correction(actual)
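How the estimate-then-correct cycle fits together, as a sketch: the task is admitted against an estimated token cost, and `report_token_usage` refunds the difference once the true usage is known. `fake_completion` is illustrative; note that with the default `init_ratio=0.0` the bucket starts empty, so the first task waits for it to fill:

```python
import asyncio
from xtremeflow.scheduler.token import TokenRateScheduler, report_token_usage

async def fake_completion(prompt: str) -> str:
    await asyncio.sleep(0.1)  # simulated API latency
    # Actual usage came in under the 500-token estimate, so 200 tokens
    # are credited back to the buckets for subsequent tasks.
    report_token_usage(300)
    return f'answer to {prompt}'

async def main():
    # 60k TPM refills at 1000 tokens/s; a 500-token estimate against an
    # empty bucket therefore waits ~0.5 s before executing.
    scheduler = TokenRateScheduler(max_tpm=60000, max_concurrency=8)
    task = await scheduler.start_task(fake_completion('hello'), estimated_tokens=500)
    print(await task)

asyncio.run(main())
```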
xtremeflow-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,139 @@
Metadata-Version: 2.4
Name: xtremeflow
Version: 0.1.0
Summary: XtremeFlow: A high-performance Python asynchronous task scheduler engineered to push LLM workloads to their absolute physical limits
Author-email: Flow Jiang <flowjzh@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/flowjzh/xtremeflow
Project-URL: Repository, https://github.com/flowjzh/xtremeflow.git
Project-URL: Issues, https://github.com/flowjzh/xtremeflow/issues
Keywords: async,scheduler,rate-limiting,llm,asyncio,concurrency,backpressure
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Operating System :: OS Independent
Classifier: Typing :: Typed
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: pytest>=8.4.2; extra == "dev"
Requires-Dist: pytest-asyncio>=1.2.0; extra == "dev"
Dynamic: license-file

# XtremeFlow

> **"Exhaust rate limits, not patience. Squeezing maximum throughput from every second."**

### 🦅 About

**XtremeFlow** is a high-performance asynchronous task scheduler engineered to push **Large Language Model (LLM)** workloads to their absolute physical limits.

**The Problem:**
LLM providers throttle your velocity through a combination of **Concurrency**, **RPS**/**RPM**, or **TPS**/**TPM** limits. Most schedulers are defensive: they wait too long, leave gaps in your schedule, and waste capacity. In high-volume production, idle time is a lost resource.

**The XtremeFlow Philosophy:**
Stop being polite with your rate limits. **XtremeFlow is offensive.** It is designed to saturate your provider's capacity with surgical precision. Using a unique **Backpressure Reflex**, it maintains peak velocity until the very moment a limit is hit, executes a synchronized global cool-down, and resumes at full speed the millisecond the provider allows.

> ⚠️ **Limitation:** XtremeFlow is currently optimized for **single-process** `asyncio` applications. It manages state in-memory and does not support distributed rate limiting (e.g., Redis-based) out of the box.

### ⚡ Key Features

* **Aggressive Saturation**: Engineered to fill every available millisecond of your allowed rate, ensuring zero wasted throughput.
* **Backpressure Reflex**: Automatically detects 429 triggers and orchestrates a global **Exponential Backoff** across all workers to stay in perfect sync with provider resets.
* **Dynamic Calibration**: Supports post-request reporting of *actual* usage to instantly "refund" over-estimated capacity back to the scheduler.
* **Async-Native**: Built on `asyncio` for low-latency scheduling where every microsecond counts.
* **KV Cache Optimization**: Provides utilities to maximize KV cache utilization across parallel LLM requests, dramatically reducing token consumption and improving throughput.
* **Async Pipeline**: Producer-consumer pipeline for streaming workloads with automatic backpressure handling.

### 🚀 Quick Start

```python
import asyncio
from openai import RateLimitError
from xtremeflow.scheduler.rate_limit import auto_backoff
from xtremeflow.scheduler.token import TokenRateScheduler, report_token_usage

# Initialize: 10 concurrent slots, 60 RPM, 50k TPM
scheduler = TokenRateScheduler(
    max_concurrency=10,
    max_rpm=60,
    max_tpm=50000
)

@auto_backoff(retry_for=RateLimitError, base_retry_after=2.0)
async def call_llm_api(prompt: str):
    """
    Wraps the LLM call with the Backpressure Reflex.
    Global synchronization ensures you don't keep hitting the wall during cooldown.
    """
    print(f"Executing task: {prompt}")

    # Simulated API call
    await asyncio.sleep(1)

    # Calibration: Refund unused quota to the scheduler
    report_token_usage(450)

    return "success"

async def main():
    tasks = []
    for i in range(10):
        # Dispatch with an estimated cost to saturate the current limit
        t = await scheduler.start_task(
            call_llm_api(f"Task {i}"),
            estimated_tokens=500
        )
        tasks.append(t)

    results = await asyncio.gather(*tasks)
    print(f"XtremeFlow: Successfully processed {len(results)} tasks at peak throughput.")

if __name__ == "__main__":
    asyncio.run(main())
```
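
If the individual return values are not needed, the `wait_pending()` helper inherited from the `TaskScheduler` base class can stand in for collecting tasks into a `gather`; a variant sketch using the same objects as above:

```python
# Fire-and-forget variant: the scheduler tracks in-flight tasks itself.
for i in range(10):
    await scheduler.start_task(call_llm_api(f"Task {i}"), estimated_tokens=500)
await scheduler.wait_pending()
```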

### 🔥 Performance Tools

Beyond rate limiting, XtremeFlow provides utilities to maximize token efficiency and throughput.

**KV Cache Optimization** (`kv_batch`)
```python
from xtremeflow.kvbatch import kv_batch

# The first request establishes the KV cache, the rest run in parallel
task = kv_batch(
    llm_score(prompt) for prompt in same_job_with_different_resumes
)
results = await task
```
Can reduce token consumption by 40-60% for batched requests with shared prefixes.

**Async Pipeline** (`async_pipeline`)
```python
from xtremeflow.pipeline import async_pipeline

# Producer: scheduler-controlled, exhausts this tier's rate limit
async def producer(queue: asyncio.Queue):
    async for item in source:
        task = await scheduler.start_task(llm_api(item), estimate_tokens)
        await queue.put(task)

# Processor: slower sequential processing, yields to the next tier
async def process_item(item):
    result = await item
    return await db_write(result)  # Different rate limit tier

async for result in async_pipeline(producer, process_item):
    yield result  # Can chain into another tier's producer
```
Decouples rate limit tiers: exhausting each tier's limit frees up quota for other tasks immediately, maximizing overall system throughput.
xtremeflow-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
xtremeflow/__init__.py,sha256=nSaIwlYraiF6jrhcHU5rDDa7c_PXYk-TGEJ85NRK1xM,109
xtremeflow/kvbatch.py,sha256=F_fhRn4p3t4yXDbNAu-_GlnbjwqZCC6LFgZdygTn2Rw,2173
xtremeflow/pipeline.py,sha256=RIblYq9cmjbUPnq3KbxliMx34Qj6BAvrdRV2H-tApWs,1015
xtremeflow/scheduler/__init__.py,sha256=zFFT4IWQfZtnt1bGwbAITv4W8jMK96W3UrSk7-EV10U,68
xtremeflow/scheduler/base.py,sha256=nfo4B90VcEem55AyF21kXQA8TUgiUtIOKoUXTPSpSiA,1148
xtremeflow/scheduler/rate_limit.py,sha256=f9L_eCXC2Ess9vl8n90PY5_i4yu7zjiNOR9trjhCov4,3711
xtremeflow/scheduler/request.py,sha256=OHR3WcsxgTTsvDcB8rkxeldfwUK57UZa3f5OtTy-T1U,1662
xtremeflow/scheduler/token.py,sha256=4qWvOZvWNy3Nd4tiEuCCy_9kigK1rVvvdls0EQKetI8,2822
xtremeflow-0.1.0.dist-info/licenses/LICENSE,sha256=OhxWdg6CEDY67lRrpbPvRuKrxRpcZEL4S84Ip25MFGk,1067
xtremeflow-0.1.0.dist-info/METADATA,sha256=tMngmtrwxoafokwDSyBpuGmWNb82-m3kNktd2WOEQ90,5969
xtremeflow-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
xtremeflow-0.1.0.dist-info/top_level.txt,sha256=eBfGcIsAFKFX4AKTq_ELDu8YoZhhCJWv3ijTjFBiVWs,11
xtremeflow-0.1.0.dist-info/RECORD,,
xtremeflow-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Flow Jiang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
xtremeflow-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
xtremeflow