kubefn 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kubefn-0.5.0/PKG-INFO +21 -0
- kubefn-0.5.0/kubefn/__init__.py +23 -0
- kubefn-0.5.0/kubefn/__main__.py +110 -0
- kubefn-0.5.0/kubefn/circuit_breaker.py +166 -0
- kubefn-0.5.0/kubefn/context.py +51 -0
- kubefn-0.5.0/kubefn/decorators.py +69 -0
- kubefn-0.5.0/kubefn/drain_manager.py +115 -0
- kubefn-0.5.0/kubefn/heap_exchange.py +286 -0
- kubefn-0.5.0/kubefn/heap_guard.py +188 -0
- kubefn-0.5.0/kubefn/introspection.py +313 -0
- kubefn-0.5.0/kubefn/loader.py +137 -0
- kubefn-0.5.0/kubefn/metrics.py +198 -0
- kubefn-0.5.0/kubefn/request_timeout.py +86 -0
- kubefn-0.5.0/kubefn/scheduler.py +311 -0
- kubefn-0.5.0/kubefn/server.py +555 -0
- kubefn-0.5.0/kubefn.egg-info/PKG-INFO +21 -0
- kubefn-0.5.0/kubefn.egg-info/SOURCES.txt +21 -0
- kubefn-0.5.0/kubefn.egg-info/dependency_links.txt +1 -0
- kubefn-0.5.0/kubefn.egg-info/entry_points.txt +2 -0
- kubefn-0.5.0/kubefn.egg-info/top_level.txt +1 -0
- kubefn-0.5.0/pyproject.toml +32 -0
- kubefn-0.5.0/setup.cfg +4 -0
- kubefn-0.5.0/tests/test_heap_exchange.py +91 -0
kubefn-0.5.0/PKG-INFO
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kubefn
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: KubeFn Python Runtime — Live Application Fabric for Memory-Continuous Architecture
|
|
5
|
+
Author-email: Pranab Sarkar <developer@pranab.co.in>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://kubefn.com
|
|
8
|
+
Project-URL: GitHub, https://github.com/kubefn/kubefn
|
|
9
|
+
Project-URL: Documentation, https://kubefn.com
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/kubefn/kubefn/issues
|
|
11
|
+
Keywords: kubefn,faas,functions,shared-memory,zero-copy,kubernetes,heap-exchange
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
19
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
KubeFn Python Runtime — Memory-Continuous Architecture for Python.

Same concept as the JVM runtime: multiple independently deployable
functions share a single Python interpreter, exchanging objects
in shared memory with zero serialization.

    from kubefn import function

    @function("/predict", methods=["POST"], group="ml-pipeline")
    def predict(request, ctx):
        features = ctx.heap.get("features")  # Zero-copy from shared heap
        prediction = model.predict(features)
        ctx.heap.publish("prediction", prediction)
        return {"score": prediction}
"""

# Package version. Keep in sync with pyproject.toml / PKG-INFO
# (this previously lagged at "0.4.0" while the package shipped as 0.5.0).
__version__ = "0.5.0"

from .decorators import function
from .scheduler import schedule
from .heap_exchange import HeapExchange
from .context import FnContext, FnRequest
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""
|
|
2
|
+
KubeFn Python Runtime — entry point.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python -m kubefn
|
|
6
|
+
python -m kubefn --port 8080 --functions-dir /var/kubefn/functions
|
|
7
|
+
python -m kubefn --timeout 60000 --no-scheduler
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
from .circuit_breaker import CircuitBreakerRegistry
|
|
14
|
+
from .drain_manager import DrainManager
|
|
15
|
+
from .introspection import CaptureEngine
|
|
16
|
+
from .metrics import MetricsRecorder
|
|
17
|
+
from .heap_guard import HeapGuard, HeapGuardConfig
|
|
18
|
+
from .scheduler import SchedulerEngine
|
|
19
|
+
from .server import run_server
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def main():
    """CLI entry point: parse flags, build the runtime subsystems, run the server.

    Every flag falls back to a ``KUBEFN_*`` environment variable so the same
    invocation works both under Kubernetes (env-driven) and locally
    (flag-driven). Flags always win over the environment because the env
    value only supplies the argparse default.
    """
    parser = argparse.ArgumentParser(description="KubeFn Python Runtime")
    parser.add_argument(
        "--port", type=int,
        default=int(os.environ.get("KUBEFN_PORT", "8080")),
    )
    parser.add_argument(
        "--functions-dir",
        default=os.environ.get("KUBEFN_FUNCTIONS_DIR", "/var/kubefn/functions"),
    )
    parser.add_argument(
        "--timeout", type=int,
        default=int(os.environ.get("KUBEFN_TIMEOUT_MS", "30000")),
        help="Request timeout in milliseconds (default: 30000)",
    )
    parser.add_argument(
        "--max-heap-objects", type=int,
        default=int(os.environ.get("KUBEFN_MAX_HEAP_OBJECTS", "10000")),
        help="Maximum objects in HeapExchange (default: 10000)",
    )
    parser.add_argument(
        "--cb-threshold", type=int,
        default=int(os.environ.get("KUBEFN_CB_THRESHOLD", "5")),
        help="Circuit breaker failure threshold (default: 5)",
    )
    parser.add_argument(
        "--cb-reset-timeout", type=float,
        default=float(os.environ.get("KUBEFN_CB_RESET_TIMEOUT", "30.0")),
        help="Circuit breaker reset timeout in seconds (default: 30)",
    )
    parser.add_argument(
        "--drain-timeout", type=float,
        default=float(os.environ.get("KUBEFN_DRAIN_TIMEOUT", "30.0")),
        help="Drain timeout in seconds (default: 30)",
    )
    parser.add_argument(
        "--event-ring-capacity", type=int,
        default=int(os.environ.get("KUBEFN_EVENT_RING_CAPACITY", "50000")),
        help="Causal event ring buffer capacity (default: 50000)",
    )
    parser.add_argument(
        # store_true with an env-derived default: setting KUBEFN_NO_SCHEDULER=1
        # (or "true", case-insensitive) disables the scheduler without a flag.
        "--no-scheduler", action="store_true",
        default=os.environ.get("KUBEFN_NO_SCHEDULER", "").lower() in ("1", "true"),
        help="Disable the cron scheduler",
    )
    args = parser.parse_args()

    # ── Create production subsystems ──────────────────────────────────

    heap_guard = HeapGuard(HeapGuardConfig(
        max_objects=args.max_heap_objects,
    ))

    circuit_breakers = CircuitBreakerRegistry(
        failure_threshold=args.cb_threshold,
        reset_timeout_s=args.cb_reset_timeout,
    )

    drain_manager = DrainManager(
        drain_timeout_s=args.drain_timeout,
    )

    capture_engine = CaptureEngine(
        capacity=args.event_ring_capacity,
    )

    metrics_recorder = MetricsRecorder()

    # Scheduler is optional; None tells run_server not to start one.
    scheduler: SchedulerEngine | None = None
    if not args.no_scheduler:
        scheduler = SchedulerEngine()

    # ── Start server with all subsystems ──────────────────────────────

    run_server(
        port=args.port,
        functions_dir=args.functions_dir,
        heap_guard=heap_guard,
        circuit_breakers=circuit_breakers,
        drain_manager=drain_manager,
        capture_engine=capture_engine,
        metrics_recorder=metrics_recorder,
        scheduler=scheduler,
        request_timeout_ms=args.timeout,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Circuit Breaker — per-function fault isolation.
|
|
3
|
+
|
|
4
|
+
Prevents cascading failures by tracking error rates per function
|
|
5
|
+
and temporarily disabling functions that exceed the failure threshold.
|
|
6
|
+
|
|
7
|
+
States:
|
|
8
|
+
CLOSED → normal operation, requests pass through
|
|
9
|
+
OPEN → function disabled, requests fail-fast
|
|
10
|
+
HALF_OPEN → trial period, one request allowed to test recovery
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import enum
|
|
14
|
+
import logging
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("kubefn.circuit_breaker")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CircuitState(enum.Enum):
    """Lifecycle states of a circuit breaker (see module docstring)."""
    CLOSED = "CLOSED"
    OPEN = "OPEN"
    HALF_OPEN = "HALF_OPEN"


@dataclass
class CircuitBreaker:
    """Per-function circuit breaker with configurable thresholds.

    All state mutation happens while holding ``_lock``; fields are left
    public for introspection via status().
    """

    function_name: str
    failure_threshold: int = 5
    reset_timeout_s: float = 30.0
    half_open_max_calls: int = 1

    # Internal state — mutated only under _lock.
    state: CircuitState = field(default=CircuitState.CLOSED)
    failure_count: int = field(default=0)
    success_count: int = field(default=0)
    last_failure_time: float = field(default=0.0)
    last_state_change: float = field(default_factory=time.time)
    total_rejections: int = field(default=0)
    half_open_calls: int = field(default=0)
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)

    def is_allowed(self) -> bool:
        """Check if a request is allowed through this breaker."""
        with self._lock:
            if self.state is CircuitState.CLOSED:
                return True

            if self.state is CircuitState.OPEN:
                elapsed = time.time() - self.last_failure_time
                if elapsed >= self.reset_timeout_s:
                    # Reset window has passed: move to HALF_OPEN and admit
                    # this call as the first (counted) trial request.
                    self._transition(CircuitState.HALF_OPEN)
                    self.half_open_calls = 1
                    return True
                self.total_rejections += 1
                return False

            # HALF_OPEN: admit at most half_open_max_calls trial requests.
            if self.half_open_calls < self.half_open_max_calls:
                self.half_open_calls += 1
                return True
            self.total_rejections += 1
            return False

    def record_success(self) -> None:
        """Record a successful invocation."""
        with self._lock:
            if self.state is CircuitState.HALF_OPEN:
                # Trial call succeeded — close the circuit again.
                self.success_count += 1
                self._transition(CircuitState.CLOSED)
                self.failure_count = 0
            elif self.state is CircuitState.CLOSED:
                self.success_count += 1
                self.failure_count = 0

    def record_failure(self) -> None:
        """Record a failed invocation."""
        with self._lock:
            self.failure_count += 1
            self.last_failure_time = time.time()

            if self.state is CircuitState.HALF_OPEN:
                # A failed trial call re-opens the circuit immediately.
                self._transition(CircuitState.OPEN)
            elif (self.state is CircuitState.CLOSED
                    and self.failure_count >= self.failure_threshold):
                self._transition(CircuitState.OPEN)
                logger.warning(
                    f"Circuit OPEN for {self.function_name}: "
                    f"{self.failure_count} consecutive failures"
                )

    def _transition(self, new_state: CircuitState) -> None:
        # Centralised state change: stamps the change time and resets the
        # half-open trial counter. Caller must already hold _lock.
        previous = self.state
        self.state = new_state
        self.last_state_change = time.time()
        self.half_open_calls = 0
        logger.info(
            f"Circuit breaker [{self.function_name}]: "
            f"{previous.value} -> {new_state.value}"
        )

    def status(self) -> dict:
        """Return a snapshot of this breaker's state and counters."""
        with self._lock:
            return {
                "function": self.function_name,
                "state": self.state.value,
                "failureCount": self.failure_count,
                "successCount": self.success_count,
                "totalRejections": self.total_rejections,
                "lastFailureTime": self.last_failure_time,
                "lastStateChange": self.last_state_change,
                "failureThreshold": self.failure_threshold,
                "resetTimeoutS": self.reset_timeout_s,
            }
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class CircuitBreakerRegistry:
    """
    Manages per-function circuit breakers. Automatically creates
    a breaker on first access for any function name.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        reset_timeout_s: float = 30.0,
    ):
        self._breakers: dict[str, CircuitBreaker] = {}
        self._lock = threading.Lock()
        self._failure_threshold = failure_threshold
        self._reset_timeout_s = reset_timeout_s

    def _get_or_create(self, func_name: str) -> CircuitBreaker:
        # Fast path: lock-free read for the common case of a known function.
        try:
            return self._breakers[func_name]
        except KeyError:
            pass
        with self._lock:
            # Re-check under the lock so concurrent callers create one breaker.
            breaker = self._breakers.get(func_name)
            if breaker is None:
                breaker = CircuitBreaker(
                    function_name=func_name,
                    failure_threshold=self._failure_threshold,
                    reset_timeout_s=self._reset_timeout_s,
                )
                self._breakers[func_name] = breaker
            return breaker

    def is_allowed(self, func_name: str) -> bool:
        """True if the named function's breaker admits a request."""
        return self._get_or_create(func_name).is_allowed()

    def record_success(self, func_name: str) -> None:
        """Record a successful invocation for the named function."""
        self._get_or_create(func_name).record_success()

    def record_failure(self, func_name: str) -> None:
        """Record a failed invocation for the named function."""
        self._get_or_create(func_name).record_failure()

    def get_status(self) -> dict:
        """Return status of all known breakers."""
        with self._lock:
            return {
                name: breaker.status()
                for name, breaker in sorted(self._breakers.items())
            }
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Function execution context — the function's window into the organism.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
from .heap_exchange import HeapExchange
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class FnRequest:
    """Incoming HTTP request as seen by a function handler."""
    method: str                    # HTTP method, e.g. "GET" or "POST"
    path: str                      # request path
    headers: dict[str, str]
    query_params: dict[str, str]
    body: bytes                    # raw request body
    body_text: str = ""            # decoded body text, if populated by the server — empty otherwise

    def query_param(self, name: str, default: Optional[str] = None) -> Optional[str]:
        """Return the query parameter *name*, or *default* if absent."""
        return self.query_params.get(name, default)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class FnContext:
    """
    The function's window into the living organism.
    Provides access to HeapExchange, cache, logging, config.
    """
    heap: HeapExchange          # shared HeapExchange for passing objects between functions
    group_name: str             # name of the function group this handler belongs to
    function_name: str
    revision_id: str
    config: dict[str, str]
    _cache: Optional[dict] = None   # per-context cache; lazily created in __post_init__

    def __post_init__(self):
        # Create the cache per instance rather than using a shared mutable default.
        if self._cache is None:
            self._cache = {}

    @property
    def logger(self) -> logging.Logger:
        """Logger namespaced as kubefn.<group>.<function>."""
        return logging.getLogger(f"kubefn.{self.group_name}.{self.function_name}")

    def cache_get(self, key: str) -> Optional[Any]:
        """Return the cached value for *key*, or None if absent."""
        return self._cache.get(key)

    def cache_put(self, key: str, value: Any):
        """Store *value* under *key* in the per-context cache."""
        self._cache[key] = value
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Function decorators for KubeFn Python functions.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from kubefn.decorators import function
|
|
6
|
+
|
|
7
|
+
@function("/predict", methods=["POST"], group="ml-pipeline")
|
|
8
|
+
def predict(request, ctx):
|
|
9
|
+
return {"prediction": 0.95}
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import functools
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
|
|
17
|
+
class FunctionMetadata:
|
|
18
|
+
"""Metadata extracted from the @function decorator."""
|
|
19
|
+
path: str
|
|
20
|
+
methods: list[str]
|
|
21
|
+
group: str
|
|
22
|
+
name: str
|
|
23
|
+
handler: callable
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Global registry of decorated functions
|
|
27
|
+
_registry: list[FunctionMetadata] = []
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def function(path: str, methods: list[str] = None, group: str = "default"):
|
|
31
|
+
"""
|
|
32
|
+
Decorator that registers a Python function as a KubeFn handler.
|
|
33
|
+
|
|
34
|
+
@function("/predict", methods=["POST"], group="ml-pipeline")
|
|
35
|
+
def predict(request, ctx):
|
|
36
|
+
features = ctx.heap.get("features")
|
|
37
|
+
return {"prediction": model.predict(features)}
|
|
38
|
+
"""
|
|
39
|
+
if methods is None:
|
|
40
|
+
methods = ["GET", "POST"]
|
|
41
|
+
|
|
42
|
+
def decorator(func):
|
|
43
|
+
metadata = FunctionMetadata(
|
|
44
|
+
path=path,
|
|
45
|
+
methods=methods,
|
|
46
|
+
group=group,
|
|
47
|
+
name=func.__name__,
|
|
48
|
+
handler=func,
|
|
49
|
+
)
|
|
50
|
+
_registry.append(metadata)
|
|
51
|
+
|
|
52
|
+
@functools.wraps(func)
|
|
53
|
+
def wrapper(*args, **kwargs):
|
|
54
|
+
return func(*args, **kwargs)
|
|
55
|
+
|
|
56
|
+
wrapper._kubefn_metadata = metadata
|
|
57
|
+
return wrapper
|
|
58
|
+
|
|
59
|
+
return decorator
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_registered_functions() -> list[FunctionMetadata]:
|
|
63
|
+
"""Get all registered function handlers."""
|
|
64
|
+
return list(_registry)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def clear_registry():
|
|
68
|
+
"""Clear the function registry (used during hot-reload)."""
|
|
69
|
+
_registry.clear()
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Drain Manager — graceful shutdown and hot-swap coordination.
|
|
3
|
+
|
|
4
|
+
Tracks in-flight requests and supports draining: rejecting new requests
|
|
5
|
+
while allowing in-flight requests to complete before shutdown or reload.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("kubefn.drain")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DrainManager:
|
|
16
|
+
"""
|
|
17
|
+
Thread-safe drain coordinator. Used during graceful shutdown
|
|
18
|
+
and function hot-swap to ensure zero dropped requests.
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
if not drain.acquire():
|
|
22
|
+
return 503 # draining, reject
|
|
23
|
+
try:
|
|
24
|
+
handle_request()
|
|
25
|
+
finally:
|
|
26
|
+
drain.release()
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
def __init__(self, drain_timeout_s: float = 30.0):
|
|
30
|
+
self._lock = threading.Lock()
|
|
31
|
+
self._drained = threading.Event()
|
|
32
|
+
self._draining = False
|
|
33
|
+
self._in_flight = 0
|
|
34
|
+
self._drain_timeout_s = drain_timeout_s
|
|
35
|
+
self._drain_started_at: float | None = None
|
|
36
|
+
|
|
37
|
+
def acquire(self) -> bool:
|
|
38
|
+
"""
|
|
39
|
+
Attempt to acquire a request slot.
|
|
40
|
+
Returns False if the system is draining (caller should return 503).
|
|
41
|
+
"""
|
|
42
|
+
with self._lock:
|
|
43
|
+
if self._draining:
|
|
44
|
+
return False
|
|
45
|
+
self._in_flight += 1
|
|
46
|
+
self._drained.clear()
|
|
47
|
+
return True
|
|
48
|
+
|
|
49
|
+
def release(self) -> None:
|
|
50
|
+
"""Release a request slot after handling completes."""
|
|
51
|
+
with self._lock:
|
|
52
|
+
self._in_flight -= 1
|
|
53
|
+
if self._in_flight <= 0:
|
|
54
|
+
self._in_flight = 0
|
|
55
|
+
self._drained.set()
|
|
56
|
+
|
|
57
|
+
def start_drain(self, timeout_s: float | None = None) -> bool:
|
|
58
|
+
"""
|
|
59
|
+
Begin draining. Blocks until all in-flight requests complete
|
|
60
|
+
or the timeout expires.
|
|
61
|
+
|
|
62
|
+
Returns True if drain completed (in-flight reached 0),
|
|
63
|
+
False if timed out.
|
|
64
|
+
"""
|
|
65
|
+
timeout = timeout_s if timeout_s is not None else self._drain_timeout_s
|
|
66
|
+
|
|
67
|
+
with self._lock:
|
|
68
|
+
self._draining = True
|
|
69
|
+
self._drain_started_at = time.time()
|
|
70
|
+
if self._in_flight == 0:
|
|
71
|
+
self._drained.set()
|
|
72
|
+
logger.info("Drain: no in-flight requests, drained immediately")
|
|
73
|
+
return True
|
|
74
|
+
|
|
75
|
+
logger.info(
|
|
76
|
+
f"Drain started: {self._in_flight} in-flight requests, "
|
|
77
|
+
f"timeout={timeout}s"
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
drained = self._drained.wait(timeout=timeout)
|
|
81
|
+
|
|
82
|
+
if drained:
|
|
83
|
+
logger.info("Drain complete: all in-flight requests finished")
|
|
84
|
+
else:
|
|
85
|
+
logger.warning(
|
|
86
|
+
f"Drain timed out after {timeout}s with "
|
|
87
|
+
f"{self._in_flight} requests still in-flight"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
return drained
|
|
91
|
+
|
|
92
|
+
def cancel_drain(self) -> None:
|
|
93
|
+
"""Cancel an active drain (e.g. if reload was aborted)."""
|
|
94
|
+
with self._lock:
|
|
95
|
+
self._draining = False
|
|
96
|
+
self._drain_started_at = None
|
|
97
|
+
logger.info("Drain cancelled")
|
|
98
|
+
|
|
99
|
+
def is_draining(self) -> bool:
|
|
100
|
+
return self._draining
|
|
101
|
+
|
|
102
|
+
def in_flight_count(self) -> int:
|
|
103
|
+
return self._in_flight
|
|
104
|
+
|
|
105
|
+
def status(self) -> dict:
|
|
106
|
+
with self._lock:
|
|
107
|
+
result: dict = {
|
|
108
|
+
"draining": self._draining,
|
|
109
|
+
"inFlight": self._in_flight,
|
|
110
|
+
"drainTimeoutS": self._drain_timeout_s,
|
|
111
|
+
}
|
|
112
|
+
if self._drain_started_at is not None:
|
|
113
|
+
result["drainStartedAt"] = self._drain_started_at
|
|
114
|
+
result["drainElapsedS"] = round(time.time() - self._drain_started_at, 2)
|
|
115
|
+
return result
|