llro 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llro-1.0.0.dist-info/METADATA +262 -0
- llro-1.0.0.dist-info/RECORD +8 -0
- llro-1.0.0.dist-info/WHEEL +5 -0
- llro-1.0.0.dist-info/entry_points.txt +3 -0
- llro-1.0.0.dist-info/licenses/LICENSE +21 -0
- llro-1.0.0.dist-info/top_level.txt +2 -0
- llro.py +720 -0
- llro_cli.py +159 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llro
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Service to get lowest latency route to targets and set static routes.
|
|
5
|
+
Author: LLRO contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/greensec/LowestLatencyRoutesOptimizer
|
|
8
|
+
Project-URL: Repository, https://github.com/greensec/LowestLatencyRoutesOptimizer
|
|
9
|
+
Project-URL: Issues, https://github.com/greensec/LowestLatencyRoutesOptimizer/issues
|
|
10
|
+
Keywords: network,routing,latency,icmp
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
14
|
+
Classifier: Topic :: System :: Networking
|
|
15
|
+
Requires-Python: >=3.7
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: icmplib==3.0.4
|
|
19
|
+
Requires-Dist: PyYAML==6.0.1; python_version < "3.8"
|
|
20
|
+
Requires-Dist: PyYAML==6.0.3; python_version >= "3.8"
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# LLRO (Lowest Latency Routes Optimizer)
|
|
24
|
+
|
|
25
|
+
Service to measure ICMP latency from multiple uplinks and keep per-host `/32` routes pinned to the best path.
|
|
26
|
+
|
|
27
|
+
LLRO continuously probes each monitored destination through each configured uplink source, compares packet loss and latency, and installs host routes using Linux `ip route` so traffic for that destination follows the healthiest path. It can freeze or override routing decisions per host through a local admin socket, and it can fall back to predefined routes when probes fail. In short, it automates per-destination path selection based on live network conditions instead of static one-time routing choices.
|
|
28
|
+
|
|
29
|
+
For a deeper technical walkthrough, see `HOW_IT_WORKS.md` in the [project repository](https://github.com/greensec/LowestLatencyRoutesOptimizer).
|
|
30
|
+
|
|
31
|
+
## Runtime requirements
|
|
32
|
+
|
|
33
|
+
- Linux with `iproute2` (`ip` command available, default path `/usr/sbin/ip`)
|
|
34
|
+
- Root privileges or equivalent capabilities (`CAP_NET_ADMIN` and raw ICMP capability)
|
|
35
|
+
- Python `>=3.7`
|
|
36
|
+
|
|
37
|
+
## Development setup
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv sync
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Run locally
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
uv run llro --config ./config.yml
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Configuration (recommended model)
|
|
50
|
+
|
|
51
|
+
Start from the example:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
cp config.example.yml config.yml
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Example:
|
|
58
|
+
|
|
59
|
+
```yaml
|
|
60
|
+
monitor:
|
|
61
|
+
- 1.1.1.1
|
|
62
|
+
- 8.8.8.8
|
|
63
|
+
|
|
64
|
+
routes:
|
|
65
|
+
- name: wan_fiber
|
|
66
|
+
device: eth0
|
|
67
|
+
probe_source: 192.168.0.8
|
|
68
|
+
gateway: 192.168.0.1
|
|
69
|
+
- name: wan_lte
|
|
70
|
+
device: wwan0
|
|
71
|
+
probe_source: 10.0.0.2
|
|
72
|
+
gateway: 10.0.0.1
|
|
73
|
+
|
|
74
|
+
also_route:
|
|
75
|
+
1.1.1.1:
|
|
76
|
+
- 1.0.0.1
|
|
77
|
+
8.8.8.8:
|
|
78
|
+
- 8.8.4.4
|
|
79
|
+
|
|
80
|
+
fallback_routes:
|
|
81
|
+
1.1.1.1: wan_fiber
|
|
82
|
+
8.8.8.8: wan_lte
|
|
83
|
+
|
|
84
|
+
rtt_threshold: 20
|
|
85
|
+
packet_loss_threshold: 2
|
|
86
|
+
test_count: 5
|
|
87
|
+
test_interval: 1
|
|
88
|
+
scan_interval: 30
|
|
89
|
+
delete_preadded_routes: true
|
|
90
|
+
# ip_bin: /usr/sbin/ip
|
|
91
|
+
# admin_socket_path: /run/llro/admin.sock
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Key fields
|
|
95
|
+
|
|
96
|
+
- `monitor`: host IPs to probe and route.
|
|
97
|
+
- `routes`: route candidates.
|
|
98
|
+
- `routes[].name`: unique route identifier.
|
|
99
|
+
- `routes[].device`: network device used for route installation.
|
|
100
|
+
- `routes[].probe_source`: source IP used for probing and route `src`.
|
|
101
|
+
- `routes[].gateway`: next-hop gateway for the host route.
|
|
102
|
+
- `also_route`: optional extra IPs that should follow a monitored host route.
|
|
103
|
+
- `fallback_routes`: optional fallback route name per monitored host.
|
|
104
|
+
- `rtt_threshold`: minimum RTT improvement (ms) required before switching.
|
|
105
|
+
- `packet_loss_threshold`: packet-loss threshold (%) that can force switching.
|
|
106
|
+
- `test_count`: number of probe rounds aggregated before routing decisions.
|
|
107
|
+
- `test_interval`: interval between ping packets in a probe run.
|
|
108
|
+
- `scan_interval`: delay between scan cycles.
|
|
109
|
+
- `delete_preadded_routes`: remove existing static `/32` routes for monitored hosts on startup.
|
|
110
|
+
- `ip_bin`: optional `ip` binary path override.
|
|
111
|
+
- `admin_socket_path`: Unix socket path used by `llro-cli` for admin/monitoring.
|
|
112
|
+
|
|
113
|
+
## Legacy config compatibility
|
|
114
|
+
|
|
115
|
+
The old `interfaces` model is still accepted for now:
|
|
116
|
+
|
|
117
|
+
```yaml
|
|
118
|
+
interfaces:
|
|
119
|
+
eth0:
|
|
120
|
+
- 192.168.0.8
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Compatibility mode maps each `interfaces.<device>.<source>` entry to a generated route candidate:
|
|
124
|
+
|
|
125
|
+
- `name: "<device>:<source>"`
|
|
126
|
+
- `probe_source: <source>`
|
|
127
|
+
- `gateway: <source>` (legacy behavior)
|
|
128
|
+
|
|
129
|
+
`fallback_routes` may reference either route names (new) or legacy source IPs (old).
|
|
130
|
+
|
|
131
|
+
## Tooling (Make + uv)
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
make validate # format check + lint + typecheck + dead-code scan
|
|
135
|
+
make test # pytest
|
|
136
|
+
make integration-test # dockerized route mutation integration test
|
|
137
|
+
make build # build sdist/wheel + twine metadata check
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Auto-fix formatting/lint issues:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
make fix
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Run integration tests directly:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
RUN_DOCKER_INTEGRATION=1 uv run pytest -m integration
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The compose integration scenario spins up multiple containers, blocks ICMP on one path, and verifies LLRO switches the monitored host route to the remaining healthy path.
|
|
153
|
+
|
|
154
|
+
## Install as CLI
|
|
155
|
+
|
|
156
|
+
From local checkout:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
uv pip install .
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
From wheel:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
uv pip install dist/*.whl
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Then run:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
llro --config /etc/llro.yml
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Admin commands (against running daemon):
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
llro-cli status
|
|
178
|
+
llro-cli override --host 1.1.1.1 --route wan_fiber
|
|
179
|
+
llro-cli disable-switching --all
|
|
180
|
+
llro-cli reset-auto --host 1.1.1.1
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Example output:
|
|
184
|
+
|
|
185
|
+
```text
|
|
186
|
+
$ llro-cli status
|
|
187
|
+
Host 1.1.1.1 | mode=auto | switching=yes | current=wan_fiber | override=-
|
|
188
|
+
wan_fiber: rtt=14.2 ms, loss=0%, alive=yes
|
|
189
|
+
wan_lte: rtt=35.8 ms, loss=0%, alive=yes
|
|
190
|
+
Host 8.8.8.8 | mode=frozen | switching=no | current=wan_lte | override=-
|
|
191
|
+
wan_fiber: rtt=48.1 ms, loss=0%, alive=yes
|
|
192
|
+
wan_lte: rtt=31.6 ms, loss=0%, alive=yes
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
```text
|
|
196
|
+
$ llro-cli status --json
|
|
197
|
+
{
|
|
198
|
+
"hosts": [
|
|
199
|
+
{
|
|
200
|
+
"current_route": "wan_fiber",
|
|
201
|
+
"host": "1.1.1.1",
|
|
202
|
+
"mode": "auto",
|
|
203
|
+
"override_route": null,
|
|
204
|
+
"routes": {
|
|
205
|
+
"wan_fiber": {
|
|
206
|
+
"avg_loss": 0,
|
|
207
|
+
"avg_rtt": 14.2,
|
|
208
|
+
"is_alive": true
|
|
209
|
+
},
|
|
210
|
+
"wan_lte": {
|
|
211
|
+
"avg_loss": 0,
|
|
212
|
+
"avg_rtt": 35.8,
|
|
213
|
+
"is_alive": true
|
|
214
|
+
}
|
|
215
|
+
},
|
|
216
|
+
"switching_enabled": true
|
|
217
|
+
}
|
|
218
|
+
]
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
```text
|
|
223
|
+
$ llro-cli override --host 1.1.1.1 --route wan_lte
|
|
224
|
+
{"host": "1.1.1.1", "mode": "override", "route": "wan_lte"}
|
|
225
|
+
|
|
226
|
+
$ llro-cli disable-switching --all
|
|
227
|
+
{"hosts": ["1.1.1.1", "8.8.8.8"], "mode": "frozen"}
|
|
228
|
+
|
|
229
|
+
$ llro-cli reset-auto --host 1.1.1.1
|
|
230
|
+
{"hosts": ["1.1.1.1"], "mode": "auto"}
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## systemd service
|
|
234
|
+
|
|
235
|
+
Use the provided unit as a base and verify the executable path in your environment:
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
which llro
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Install:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
sudo cp llro.service /etc/systemd/system/llro.service
|
|
245
|
+
sudo systemctl daemon-reload
|
|
246
|
+
sudo systemctl enable --now llro
|
|
247
|
+
sudo systemctl status llro
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## PyPI release flow
|
|
251
|
+
|
|
252
|
+
- Local dry-run build: `make build`
|
|
253
|
+
- Publish manually with Twine:
|
|
254
|
+
- `make publish-testpypi`
|
|
255
|
+
- `make publish-pypi`
|
|
256
|
+
- GitHub Actions publish:
|
|
257
|
+
- Push tag `v*` to trigger `.github/workflows/publish-to-pypi.yml`
|
|
258
|
+
- Workflow uses trusted publishing (`id-token`) for PyPI
|
|
259
|
+
|
|
260
|
+
## Contributing
|
|
261
|
+
|
|
262
|
+
Inspired by <https://malaty.net/linux-lowest-latency-routes-optimizer/>
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
llro.py,sha256=kuY0tk0sUVbc7591jGnIw8UyWEeVkrEurm1_mMjMsSU,28372
|
|
2
|
+
llro_cli.py,sha256=qOr1C-owFKvi92wcZX5J69YQab7ycMKKlNXBdySlgeI,5926
|
|
3
|
+
llro-1.0.0.dist-info/licenses/LICENSE,sha256=KmD-Xnve5MfiDuucmOAwZ7mvDTcOdsT5OVpiVkKCINU,1072
|
|
4
|
+
llro-1.0.0.dist-info/METADATA,sha256=VqxguI7wCYAzL60wH_RWiGwkHyVti8KvQPuHXLjHz8s,6861
|
|
5
|
+
llro-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
llro-1.0.0.dist-info/entry_points.txt,sha256=9A9NRiTt0qwNN68CYGvB7Ecvqn4EqhV36Kc5167y7E8,60
|
|
7
|
+
llro-1.0.0.dist-info/top_level.txt,sha256=SXgObb8zdx3VlWbQcKfjSFscYrzHHye3rpNhjjuVCuA,14
|
|
8
|
+
llro-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Stefan Meinecke
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
llro.py
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import shlex
|
|
8
|
+
import signal
|
|
9
|
+
import stat
|
|
10
|
+
import subprocess
|
|
11
|
+
import sys
|
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
from icmplib import async_multiping
|
|
16
|
+
|
|
17
|
+
# Root logger: INFO level, each record prefixed with a timestamp and a padded level name.
logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO)

# Admin socket location used when the config does not provide admin_socket_path.
DEFAULT_ADMIN_SOCKET_PATH = "/run/llro/admin.sock"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ConfigError(ValueError):
    """Raised by the config normalizers when a value is missing or invalid."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _as_float(value: Any, field_name: str) -> float:
|
|
30
|
+
try:
|
|
31
|
+
return float(value)
|
|
32
|
+
except (TypeError, ValueError):
|
|
33
|
+
raise ConfigError("%s must be a number" % field_name)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _as_int(value: Any, field_name: str) -> int:
|
|
37
|
+
try:
|
|
38
|
+
return int(value)
|
|
39
|
+
except (TypeError, ValueError):
|
|
40
|
+
raise ConfigError("%s must be an integer" % field_name)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _as_non_empty_string(value: Any, field_name: str) -> str:
|
|
44
|
+
if not isinstance(value, str) or not value.strip():
|
|
45
|
+
raise ConfigError("%s must be a non-empty string" % field_name)
|
|
46
|
+
return value.strip()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _normalize_monitor(config: Dict[str, Any]) -> List[str]:
    """Return the ``monitor`` list with every entry validated and stripped.

    Raises:
        ConfigError: If ``monitor`` is missing, not a list, empty, or holds
            an entry that is not a non-empty string.
    """
    entries = config.get("monitor")
    if isinstance(entries, list) and entries:
        return [_as_non_empty_string(entry, "monitor entry") for entry in entries]
    raise ConfigError("Config does not contain a non-empty monitor list")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _normalize_also_route(config: Dict[str, Any]) -> Dict[str, List[str]]:
|
|
57
|
+
raw_also_route = config.get("also_route", {})
|
|
58
|
+
if not isinstance(raw_also_route, dict):
|
|
59
|
+
raise ConfigError("also_route must be a mapping")
|
|
60
|
+
|
|
61
|
+
normalized = {}
|
|
62
|
+
for host, mapped_hosts in raw_also_route.items():
|
|
63
|
+
key = _as_non_empty_string(host, "also_route key")
|
|
64
|
+
if not isinstance(mapped_hosts, list):
|
|
65
|
+
raise ConfigError("also_route values must be lists")
|
|
66
|
+
normalized[key] = [_as_non_empty_string(item, "also_route value") for item in mapped_hosts]
|
|
67
|
+
return normalized
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _normalize_routes(config: Dict[str, Any]) -> List[Dict[str, str]]:
    """Build the list of route candidates from ``routes`` or legacy ``interfaces``.

    When ``routes`` is present it must be a non-empty list of mappings, each
    with non-empty string ``name``, ``device``, ``probe_source`` and
    ``gateway``. Otherwise the legacy ``interfaces`` mapping
    (device -> list of source IPs) is expanded into one candidate per source,
    named ``"<device>:<source>"``, with the source also used as the gateway.

    Returns:
        List[Dict[str, str]]: Dicts with the keys ``name``, ``device``,
        ``probe_source`` and ``gateway``.

    Raises:
        ConfigError: If neither model is present or any entry is malformed.
    """
    explicit = config.get("routes")

    if explicit is None:
        # Backward compatibility: old interfaces model.
        legacy = config.get("interfaces")
        if not isinstance(legacy, dict) or not legacy:
            raise ConfigError("Config must contain either routes or interfaces")

        candidates = []
        for raw_device, raw_sources in legacy.items():
            device = _as_non_empty_string(raw_device, "interfaces key")
            if not isinstance(raw_sources, list) or not raw_sources:
                raise ConfigError("interfaces[%s] must be a non-empty list" % device)
            for raw_source in raw_sources:
                source = _as_non_empty_string(raw_source, "interfaces[%s] source" % device)
                # Legacy behavior treated source and gateway as the same value.
                candidates.append(
                    {
                        "name": "%s:%s" % (device, source),
                        "device": device,
                        "probe_source": source,
                        "gateway": source,
                    }
                )
        return candidates

    if not isinstance(explicit, list) or not explicit:
        raise ConfigError("routes must be a non-empty list")

    candidates = []
    for position, entry in enumerate(explicit):
        if not isinstance(entry, dict):
            raise ConfigError("routes[%s] must be a mapping" % position)
        # Fields are validated in this fixed order so the first error reported is stable.
        candidates.append(
            {
                field: _as_non_empty_string(entry.get(field), "routes[%s].%s" % (position, field))
                for field in ("name", "device", "probe_source", "gateway")
            }
        )
    return candidates
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _normalize_fallback_routes(
|
|
118
|
+
raw_fallback_routes: Any, monitor: List[str], routes: List[Dict[str, str]]
|
|
119
|
+
) -> Dict[str, str]:
|
|
120
|
+
if raw_fallback_routes is None:
|
|
121
|
+
return {}
|
|
122
|
+
if not isinstance(raw_fallback_routes, dict):
|
|
123
|
+
raise ConfigError("fallback_routes must be a mapping")
|
|
124
|
+
|
|
125
|
+
route_names = set(route["name"] for route in routes)
|
|
126
|
+
by_probe_source = {}
|
|
127
|
+
by_gateway = {}
|
|
128
|
+
for route in routes:
|
|
129
|
+
by_probe_source[route["probe_source"]] = route["name"]
|
|
130
|
+
by_gateway.setdefault(route["gateway"], []).append(route["name"])
|
|
131
|
+
|
|
132
|
+
monitor_set = set(monitor)
|
|
133
|
+
normalized = {}
|
|
134
|
+
for host, route_ref in raw_fallback_routes.items():
|
|
135
|
+
host_key = _as_non_empty_string(host, "fallback_routes key")
|
|
136
|
+
if host_key not in monitor_set:
|
|
137
|
+
raise ConfigError("fallback_routes key '%s' must exist in monitor" % host_key)
|
|
138
|
+
ref = _as_non_empty_string(route_ref, "fallback_routes[%s]" % host_key)
|
|
139
|
+
|
|
140
|
+
if ref in route_names:
|
|
141
|
+
normalized[host_key] = ref
|
|
142
|
+
continue
|
|
143
|
+
if ref in by_probe_source:
|
|
144
|
+
normalized[host_key] = by_probe_source[ref]
|
|
145
|
+
continue
|
|
146
|
+
if ref in by_gateway and len(by_gateway[ref]) == 1:
|
|
147
|
+
normalized[host_key] = by_gateway[ref][0]
|
|
148
|
+
continue
|
|
149
|
+
raise ConfigError("fallback route '%s' for host '%s' does not match a configured route" % (ref, host_key))
|
|
150
|
+
return normalized
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def normalize_config(raw_config: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a raw config mapping and return it in normalized form.

    Defaults applied when a key is absent: ``test_count`` 3,
    ``packet_loss_threshold`` 5 (the legacy spelling ``paketloss_threshold``
    is read as a fallback), ``rtt_threshold`` 20, ``test_interval`` 0.5,
    ``scan_interval`` 10, ``delete_preadded_routes`` False, ``ip_bin``
    ``/usr/sbin/ip``, ``admin_socket_path`` ``DEFAULT_ADMIN_SOCKET_PATH``,
    ``debug`` False.

    Raises:
        ConfigError: If the config is not a mapping, route names are
            duplicated, ``test_count`` is not positive, or any section fails
            its own validation.
    """
    if not isinstance(raw_config, dict):
        raise ConfigError("Config must be a mapping")

    option = raw_config.get

    monitor = _normalize_monitor(raw_config)
    also_route = _normalize_also_route(raw_config)
    routes = _normalize_routes(raw_config)

    seen_names = set()
    for route in routes:
        name = route["name"]
        if name in seen_names:
            raise ConfigError("Duplicate route name '%s'" % name)
        seen_names.add(name)

    test_count = _as_int(option("test_count", 3), "test_count")
    if test_count <= 0:
        raise ConfigError("test_count must be greater than 0")

    # The misspelled legacy key is only consulted when the correct one is absent.
    legacy_loss_threshold = option("paketloss_threshold", 5)
    packet_loss_threshold = _as_float(
        option("packet_loss_threshold", legacy_loss_threshold),
        "packet_loss_threshold",
    )

    # The remaining fields are validated in the order they appear in the result.
    fallback_routes = _normalize_fallback_routes(option("fallback_routes"), monitor, routes)
    rtt_threshold = _as_float(option("rtt_threshold", 20), "rtt_threshold")
    test_interval = _as_float(option("test_interval", 0.5), "test_interval")
    scan_interval = _as_float(option("scan_interval", 10), "scan_interval")
    delete_preadded_routes = bool(option("delete_preadded_routes", False))
    ip_bin = _as_non_empty_string(option("ip_bin", "/usr/sbin/ip"), "ip_bin")
    admin_socket_path = _as_non_empty_string(
        option("admin_socket_path", DEFAULT_ADMIN_SOCKET_PATH), "admin_socket_path"
    )
    debug = bool(option("debug", False))

    return {
        "monitor": monitor,
        "also_route": also_route,
        "routes": routes,
        "fallback_routes": fallback_routes,
        "rtt_threshold": rtt_threshold,
        "packet_loss_threshold": packet_loss_threshold,
        # Keep legacy key to avoid breaking existing consumers/tests.
        "paketloss_threshold": packet_loss_threshold,
        "test_count": test_count,
        "test_interval": test_interval,
        "scan_interval": scan_interval,
        "delete_preadded_routes": delete_preadded_routes,
        "ip_bin": ip_bin,
        "admin_socket_path": admin_socket_path,
        "debug": debug,
    }
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class LowestLatencyRoutesOptimizer:
|
|
202
|
+
def __init__(self, config: Dict[str, Any]):
    """Normalize ``config`` and initialise the per-host routing state.

    Parameters:
        config (Dict[str, Any]): Raw configuration mapping; validated by
            ``normalize_config``, which raises ``ConfigError`` on bad input.
    """
    self.config = normalize_config(config)
    monitored = self.config["monitor"]

    self.routes = self.config["routes"]  # type: List[Dict[str, str]]
    # Route candidates indexed by their unique name.
    self.routes_by_name = {route["name"]: route for route in self.routes}  # type: Dict[str, Dict[str, str]]
    # Destination -> name of the route recorded as installed for it.
    self.current_routes = {}  # type: Dict[str, str]
    # Every monitored host starts in "auto" mode with switching enabled.
    self.route_modes = {host: "auto" for host in monitored}  # type: Dict[str, str]
    self.override_routes = {}  # type: Dict[str, str]
    self.switching_enabled = {host: True for host in monitored}  # type: Dict[str, bool]
    # Reported under "routes" by the admin status action.
    self.last_probe_snapshot = {}  # type: Dict[str, Dict[str, Dict[str, Any]]]
    # Both are created lazily, once the service is running.
    self._state_lock = None  # type: Optional[asyncio.Lock]
    self._admin_server = None  # type: Optional[asyncio.base_events.Server]
|
|
213
|
+
|
|
214
|
+
def _get_state_lock(self) -> asyncio.Lock:
|
|
215
|
+
if self._state_lock is None:
|
|
216
|
+
self._state_lock = asyncio.Lock()
|
|
217
|
+
return self._state_lock
|
|
218
|
+
|
|
219
|
+
def run(self):
    """
    Run the service until it stops.

    When ``delete_preadded_routes`` is set in the config, ``clear_routes()``
    is called first. The call then blocks inside ``asyncio.run`` until
    ``run_service()`` returns.

    Parameters:
    None

    Returns:
    None
    """
    clear_first = self.config.get("delete_preadded_routes")
    if clear_first:
        self.clear_routes()

    service = self.run_service()
    asyncio.run(service)
|
|
232
|
+
|
|
233
|
+
async def run_service(self) -> None:
    """Run the admin server and the main loop until a stop is requested.

    SIGINT and SIGTERM set the stop event handed to ``run_async``. The signal
    handlers are removed and the admin server is stopped again when
    ``run_async`` finishes, whether it returns or raises.
    """
    loop = asyncio.get_running_loop()
    stop_event = asyncio.Event()

    if hasattr(loop, "add_signal_handler"):
        handled_signals = (signal.SIGINT, signal.SIGTERM)
    else:
        handled_signals = ()

    for signum in handled_signals:
        loop.add_signal_handler(signum, stop_event.set)

    await self._start_admin_server()
    try:
        await self.run_async(stop_event)
    finally:
        for signum in handled_signals:
            loop.remove_signal_handler(signum)
        await self._stop_admin_server()
|
|
250
|
+
|
|
251
|
+
async def _start_admin_server(self) -> None:
|
|
252
|
+
socket_path = self.config["admin_socket_path"]
|
|
253
|
+
socket_dir = os.path.dirname(socket_path)
|
|
254
|
+
if socket_dir:
|
|
255
|
+
os.makedirs(socket_dir, exist_ok=True)
|
|
256
|
+
|
|
257
|
+
if os.path.exists(socket_path):
|
|
258
|
+
mode = os.stat(socket_path).st_mode
|
|
259
|
+
if stat.S_ISSOCK(mode):
|
|
260
|
+
os.unlink(socket_path)
|
|
261
|
+
else:
|
|
262
|
+
raise RuntimeError("admin_socket_path exists and is not a socket: %s" % socket_path)
|
|
263
|
+
|
|
264
|
+
self._admin_server = await asyncio.start_unix_server(self._handle_admin_client, path=socket_path)
|
|
265
|
+
os.chmod(socket_path, 0o600)
|
|
266
|
+
logging.info("Admin socket listening at %s", socket_path)
|
|
267
|
+
|
|
268
|
+
async def _stop_admin_server(self) -> None:
|
|
269
|
+
if self._admin_server is not None:
|
|
270
|
+
self._admin_server.close()
|
|
271
|
+
await self._admin_server.wait_closed()
|
|
272
|
+
self._admin_server = None
|
|
273
|
+
|
|
274
|
+
socket_path = self.config["admin_socket_path"]
|
|
275
|
+
if os.path.exists(socket_path):
|
|
276
|
+
mode = os.stat(socket_path).st_mode
|
|
277
|
+
if stat.S_ISSOCK(mode):
|
|
278
|
+
os.unlink(socket_path)
|
|
279
|
+
|
|
280
|
+
async def _handle_admin_client(self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
|
|
281
|
+
response = {"ok": False, "error": "empty request"}
|
|
282
|
+
try:
|
|
283
|
+
line = await reader.readline()
|
|
284
|
+
if line:
|
|
285
|
+
try:
|
|
286
|
+
request = json.loads(line.decode("utf-8"))
|
|
287
|
+
except ValueError:
|
|
288
|
+
response = {"ok": False, "error": "invalid JSON request"}
|
|
289
|
+
else:
|
|
290
|
+
response = await self._handle_admin_action(request)
|
|
291
|
+
except Exception as exc:
|
|
292
|
+
logging.exception("Admin request failed")
|
|
293
|
+
response = {"ok": False, "error": str(exc)}
|
|
294
|
+
|
|
295
|
+
writer.write((json.dumps(response) + "\n").encode("utf-8"))
|
|
296
|
+
await writer.drain()
|
|
297
|
+
writer.close()
|
|
298
|
+
await writer.wait_closed()
|
|
299
|
+
|
|
300
|
+
async def _build_status_data(self) -> Dict[str, Any]:
|
|
301
|
+
async with self._get_state_lock():
|
|
302
|
+
hosts = []
|
|
303
|
+
for host in self.config["monitor"]:
|
|
304
|
+
hosts.append(
|
|
305
|
+
{
|
|
306
|
+
"host": host,
|
|
307
|
+
"mode": self.route_modes.get(host, "auto"),
|
|
308
|
+
"switching_enabled": bool(self.switching_enabled.get(host, True)),
|
|
309
|
+
"current_route": self.current_routes.get(host),
|
|
310
|
+
"override_route": self.override_routes.get(host),
|
|
311
|
+
"routes": self.last_probe_snapshot.get(host, {}),
|
|
312
|
+
}
|
|
313
|
+
)
|
|
314
|
+
return {"hosts": hosts}
|
|
315
|
+
|
|
316
|
+
async def _handle_admin_action(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
317
|
+
if not isinstance(request, dict):
|
|
318
|
+
return {"ok": False, "error": "request must be a JSON object"}
|
|
319
|
+
|
|
320
|
+
action = request.get("action")
|
|
321
|
+
if action == "status":
|
|
322
|
+
return {"ok": True, "data": await self._build_status_data()}
|
|
323
|
+
|
|
324
|
+
if action == "override":
|
|
325
|
+
host = request.get("host")
|
|
326
|
+
route = request.get("route")
|
|
327
|
+
if not isinstance(host, str) or not isinstance(route, str):
|
|
328
|
+
return {"ok": False, "error": "host and route must be strings"}
|
|
329
|
+
if host not in self.config["monitor"]:
|
|
330
|
+
return {"ok": False, "error": "unknown host '%s'" % host}
|
|
331
|
+
if route not in self.routes_by_name:
|
|
332
|
+
return {"ok": False, "error": "unknown route '%s'" % route}
|
|
333
|
+
async with self._get_state_lock():
|
|
334
|
+
self.route_modes[host] = "override"
|
|
335
|
+
self.switching_enabled[host] = True
|
|
336
|
+
self.override_routes[host] = route
|
|
337
|
+
self.apply_route_config(host, route)
|
|
338
|
+
return {"ok": True, "data": {"host": host, "mode": "override", "route": route}}
|
|
339
|
+
|
|
340
|
+
if action == "disable_switching":
|
|
341
|
+
targets = self._resolve_targets(request)
|
|
342
|
+
if targets is None:
|
|
343
|
+
return {"ok": False, "error": "set either host or all=true"}
|
|
344
|
+
async with self._get_state_lock():
|
|
345
|
+
for host in targets:
|
|
346
|
+
self.switching_enabled[host] = False
|
|
347
|
+
if self.route_modes.get(host) != "override":
|
|
348
|
+
self.route_modes[host] = "frozen"
|
|
349
|
+
return {"ok": True, "data": {"hosts": targets, "mode": "frozen"}}
|
|
350
|
+
|
|
351
|
+
if action == "reset_auto":
|
|
352
|
+
targets = self._resolve_targets(request)
|
|
353
|
+
if targets is None:
|
|
354
|
+
return {"ok": False, "error": "set either host or all=true"}
|
|
355
|
+
async with self._get_state_lock():
|
|
356
|
+
for host in targets:
|
|
357
|
+
self.switching_enabled[host] = True
|
|
358
|
+
self.route_modes[host] = "auto"
|
|
359
|
+
self.override_routes.pop(host, None)
|
|
360
|
+
return {"ok": True, "data": {"hosts": targets, "mode": "auto"}}
|
|
361
|
+
|
|
362
|
+
return {"ok": False, "error": "unsupported action '%s'" % action}
|
|
363
|
+
|
|
364
|
+
def _resolve_targets(self, request: Dict[str, Any]) -> Optional[List[str]]:
|
|
365
|
+
if request.get("all") is True:
|
|
366
|
+
return list(self.config["monitor"])
|
|
367
|
+
|
|
368
|
+
host = request.get("host")
|
|
369
|
+
if isinstance(host, str) and host in self.config["monitor"]:
|
|
370
|
+
return [host]
|
|
371
|
+
return None
|
|
372
|
+
|
|
373
|
+
def _log_cmd(self, cmd: List[str]) -> None:
|
|
374
|
+
logging.debug("cmd: %s", " ".join(shlex.quote(part) for part in cmd))
|
|
375
|
+
|
|
376
|
+
def _run_ip(self, args: List[str]) -> Tuple[bool, str]:
|
|
377
|
+
cmd = [self.config["ip_bin"]] + args
|
|
378
|
+
self._log_cmd(cmd)
|
|
379
|
+
try:
|
|
380
|
+
completed = subprocess.run(
|
|
381
|
+
cmd,
|
|
382
|
+
check=False,
|
|
383
|
+
stdout=subprocess.PIPE,
|
|
384
|
+
stderr=subprocess.PIPE,
|
|
385
|
+
universal_newlines=True,
|
|
386
|
+
)
|
|
387
|
+
except Exception as exc:
|
|
388
|
+
logging.exception(exc)
|
|
389
|
+
return False, str(exc)
|
|
390
|
+
|
|
391
|
+
if completed.returncode == 0:
|
|
392
|
+
output = (completed.stdout or "").strip()
|
|
393
|
+
if output:
|
|
394
|
+
logging.debug(output)
|
|
395
|
+
return True, ""
|
|
396
|
+
|
|
397
|
+
stderr = (completed.stderr or "").strip()
|
|
398
|
+
stdout = (completed.stdout or "").strip()
|
|
399
|
+
error_text = stderr or stdout or ("exit code %s" % completed.returncode)
|
|
400
|
+
return False, error_text
|
|
401
|
+
|
|
402
|
+
def clear_routes(self):
    """
    Removes the host routes of all managed destinations and installs the fallbacks.

    Every monitored host, plus the extra destinations listed for it under
    ``also_route``, is passed to ``clear_route``. The record of current routes
    is then emptied and each configured fallback route is applied through
    ``apply_route_config``.

    Parameters:
    None

    Returns:
    None
    """
    companions = self.config.get("also_route", {})

    targets = set()
    for monitored in self.config["monitor"]:
        targets.add(monitored)
        targets.update(companions.get(monitored, ()))

    for target in targets:
        self.clear_route(target)

    self.current_routes = {}

    # With no routes left, fall back to the configured default per host.
    fallbacks = self.config.get("fallback_routes", {})
    for host, route_name in fallbacks.items():
        self.apply_route_config(host, route_name)
|
|
428
|
+
|
|
429
|
+
def clear_route(self, host: str) -> None:
    """
    Deletes the ``/32`` route for the given host.

    A failure whose message contains "RTNETLINK answers: No such process" is
    ignored; any other failure is logged as an error.

    Parameters:
    host (str): The host to remove the route for.

    Returns:
    None
    """
    logging.info("Remove %s", host)

    deleted, error_text = self._run_ip(["route", "del", "%s/32" % host])
    if not deleted and "RTNETLINK answers: No such process" not in error_text:
        logging.error("Failed to remove route for %s: %s", host, error_text)
|
|
445
|
+
|
|
446
|
+
def _route_cmd(self, action: str, destination: str, route: Dict[str, str]) -> List[str]:
|
|
447
|
+
cmd = [
|
|
448
|
+
"route",
|
|
449
|
+
action,
|
|
450
|
+
"%s/32" % destination,
|
|
451
|
+
"via",
|
|
452
|
+
route["gateway"],
|
|
453
|
+
"dev",
|
|
454
|
+
route["device"],
|
|
455
|
+
]
|
|
456
|
+
if route.get("probe_source"):
|
|
457
|
+
cmd.extend(["src", route["probe_source"]])
|
|
458
|
+
return cmd
|
|
459
|
+
|
|
460
|
+
def apply_route_config(self, host: str, route_name: str) -> None:
    """
    Installs the named route for a host and its companion destinations.

    The route definition is looked up in ``self.routes_by_name``. The host itself
    and every address listed for it under the ``also_route`` configuration key are
    routed the same way. A destination that is not yet tracked in
    ``self.current_routes`` is added first; if the add fails because the route
    already exists, or the destination is already tracked, the route is replaced.
    ``self.current_routes`` is updated only for destinations whose command succeeded.

    Parameters:
    host (str): The host to add the route for.
    route_name (str): The route candidate name to use.

    Returns:
    None
    """
    selected = self.routes_by_name.get(route_name)
    if selected is None:
        logging.error("Unknown route '%s' for host '%s'", route_name, host)
        return

    targets = [host] + self.config.get("also_route", {}).get(host, [])

    logging.info("Apply %s => %s", host, route_name)
    for destination in targets:
        already_tracked = destination in self.current_routes
        if not already_tracked:
            added, error_text = self._run_ip(self._route_cmd("add", destination, selected))
            if added:
                self.current_routes[destination] = route_name
                continue
            if "RTNETLINK answers: File exists" not in error_text:
                logging.error("Failed to add route for %s: %s", destination, error_text)
                continue
            # The kernel already has a route for this destination: fall through and replace it.

        replaced, error_text = self._run_ip(self._route_cmd("replace", destination, selected))
        if replaced:
            self.current_routes[destination] = route_name
        else:
            logging.error("Failed to replace route for %s: %s", destination, error_text)
|
|
498
|
+
|
|
499
|
+
async def run_async(self, stop_event: Optional[asyncio.Event] = None):
    """
    Runs the main loop of the optimizer.

    Every iteration probes all monitored hosts once per configured route (one
    ``async_multiping`` task per route, bound to that route's ``probe_source``),
    stores the raw results in ``self.last_probe_snapshot`` and accumulates
    per-host, per-route statistics.

    Routes are evaluated when ``test_count`` rounds have been collected, when the
    route currently recorded for a host answered nothing in this round, or when
    no route is recorded at all. During an evaluation the best route per host is
    the one with the lowest average loss, ties broken by lowest average RTT.
    RTT is averaged over the rounds in which the route answered; loss is averaged
    over every round that produced a probe result, so unanswered rounds count
    against a route instead of being ignored.
    (Assumes the probe library reports ``packet_loss`` of an unanswered probe as
    full loss - confirm against the icmplib version in use.)

    Hosts without a usable route in the evaluated window get their configured
    fallback route, or have their route removed when no fallback exists.

    Parameters:
    stop_event (Optional[asyncio.Event]): When given, the loop exits once the
        event is set and the inter-round wait is interrupted by it. When None,
        the loop runs until the task is cancelled.

    Returns:
    None
    """
    checks = 0
    # host -> route_name -> {"rtt": sum over answered rounds, "loss": sum over all rounds,
    #                        "alive": answered rounds, "rounds": rounds with a probe result}
    sums = {}
    while not (stop_event is not None and stop_event.is_set()):
        # send ICMP requests
        tasks = []
        route_names = []
        for route in self.routes:
            tasks.append(
                asyncio.create_task(
                    async_multiping(
                        self.config["monitor"],
                        count=self.config["test_count"],
                        source=route["probe_source"],
                        interval=self.config["test_interval"],
                    )
                )
            )
            route_names.append(route["name"])

        # wait and aggregate results; a failing probe task must not abort the others
        result = await asyncio.gather(*tasks, return_exceptions=True)

        # process results
        host_data = {}
        sources_up = set()  # route names with successful probes
        probe_snapshot = {}
        for x, hosts in enumerate(result):
            source = route_names[x]
            if isinstance(hosts, BaseException):
                logging.warning("%s: probe failed: %s", source, hosts)
                continue
            if not isinstance(hosts, list):
                logging.warning("%s: probe returned unexpected payload type: %s", source, type(hosts).__name__)
                continue
            for host in hosts:
                if host.address not in probe_snapshot:
                    probe_snapshot[host.address] = {}
                probe_snapshot[host.address][source] = {
                    "avg_rtt": host.avg_rtt,
                    "avg_loss": host.packet_loss,
                    "is_alive": bool(host.is_alive),
                }

                # Loss is recorded for every probe result, answered or not, so a
                # route that drops whole rounds cannot look loss-free.
                metrics = sums.setdefault(host.address, {}).setdefault(
                    source, {"rtt": 0, "loss": 0, "alive": 0, "rounds": 0}
                )
                metrics["loss"] += host.packet_loss
                metrics["rounds"] += 1

                if not host.is_alive:
                    continue

                sources_up.add(source)

                # RTT only exists for answered rounds; it is averaged over those.
                metrics["rtt"] += host.avg_rtt
                metrics["alive"] += 1

                host_data.setdefault(host.address, []).append((source, host.avg_rtt, host.packet_loss))

        async with self._get_state_lock():
            self.last_probe_snapshot = probe_snapshot

        # Detect hosts whose recorded route produced no reply at all this round,
        # and order this round's results for the debug log.
        force_reset = False
        for host, results in host_data.items():
            if host in self.current_routes and self.current_routes[host] not in sources_up:
                force_reset = True
            host_data[host] = sorted(results, key=lambda y: (y[2], y[1]))
            logging.debug("%s: %s", host, host_data[host])

        # apply routes
        checks += 1
        valid_source_found = []
        if checks >= self.config["test_count"] or force_reset or not self.current_routes:
            for host, results in sums.items():
                # route_name -> (avg_rtt, avg_loss) for routes that answered at least once
                candidates = {}
                for source, metrics in results.items():
                    if not metrics["alive"]:
                        continue
                    avg_rtt = metrics["rtt"] / metrics["alive"]
                    avg_loss = metrics["loss"] / metrics["rounds"]
                    candidates[source] = (avg_rtt, avg_loss)
                    logging.debug("%s: %s: %s %s", host, source, avg_rtt, avg_loss)

                if not candidates:
                    # No route reached this host in the window; handled by the
                    # fallback pass below.
                    continue

                # Lowest loss wins, lowest RTT breaks ties (first seen wins on equality).
                best_route, (best_rtt, _best_loss) = min(
                    candidates.items(), key=lambda item: (item[1][1], item[1][0])
                )
                current_route = self.current_routes.get(host)
                async with self._get_state_lock():
                    mode = self.route_modes.get(host, "auto")
                    switching_enabled = bool(self.switching_enabled.get(host, True))
                    override_route = self.override_routes.get(host)

                if mode == "override" and override_route:
                    valid_source_found.append(host)
                    if current_route != override_route:
                        self.apply_route_config(host, override_route)
                    continue

                if mode == "frozen" or not switching_enabled:
                    valid_source_found.append(host)
                    continue

                # no routing set
                if current_route is None or current_route not in candidates:
                    valid_source_found.append(host)
                    self.apply_route_config(host, best_route)
                    continue

                # no change
                if current_route == best_route:
                    valid_source_found.append(host)
                    logging.debug("%s: Current route is already the fastest route", host)
                    continue

                # packet loss
                current_rtt, current_loss = candidates[current_route]
                if current_loss > self.config["packet_loss_threshold"]:
                    logging.warning("%s: Current route has paketloss, need to switch", host)
                else:
                    # check rtt difference between current and fastest route
                    rtt_diff = current_rtt - best_rtt
                    logging.debug(
                        "%s: rtt_diff: %s, (%s) %s (%s) %s ",
                        host,
                        rtt_diff,
                        current_route,
                        current_rtt,
                        best_route,
                        best_rtt,
                    )

                    if rtt_diff < self.config["rtt_threshold"]:
                        valid_source_found.append(host)
                        logging.info(
                            "%s: Route not changed to %s, rtt difference %s < threshold %s",
                            host,
                            best_route,
                            round(rtt_diff, 3),
                            self.config["rtt_threshold"],
                        )
                        continue

                valid_source_found.append(host)
                self.apply_route_config(host, best_route)

            # start a fresh measurement window
            checks = 0
            sums = {}

            for sip in self.config["monitor"]:
                if sip in valid_source_found:
                    continue
                logging.warning("No valid source found for %s", sip)

                # fallback routes
                if self.config.get("fallback_routes", {}).get(sip):
                    self.apply_route_config(sip, self.config.get("fallback_routes", {}).get(sip))
                else:
                    logging.warning("No fallback routes configured for %s", sip)
                    self.clear_route(sip)

        if stop_event is None:
            await asyncio.sleep(self.config["scan_interval"])
            continue

        # Sleep for scan_interval, but wake up immediately when asked to stop.
        try:
            await asyncio.wait_for(stop_event.wait(), timeout=self.config["scan_interval"])
        except asyncio.TimeoutError:
            pass
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def main() -> None:
    """
    Command-line entry point of the daemon.

    Parses ``--config``, loads the YAML file with ``yaml.safe_load``, builds a
    ``LowestLatencyRoutesOptimizer`` from it and calls its ``run()`` method.
    Exits with status 1 when the file cannot be read or parsed, when it is
    empty, or when the optimizer rejects the configuration with ``ConfigError``.

    Returns:
    None
    """
    parser = argparse.ArgumentParser(
        prog="Lowest Latency Routes Optimizer",
        description="Sends ICMP requests to list of given hosts and set static routing for the fastest response",
    )

    parser.add_argument("--config", type=str, help="Path to config file", required=True)

    args = parser.parse_args()

    try:
        with open(args.config, "r", encoding="utf-8") as stream:
            try:
                config = yaml.safe_load(stream)
            except yaml.YAMLError as ex:
                logging.exception(ex)
                sys.exit(1)
    except Exception as e:
        # Top-level boundary: unreadable or undecodable file. SystemExit raised
        # above is not an Exception and passes through untouched.
        logging.exception(e)
        sys.exit(1)

    if not config:
        logging.error("Config could not be parsed")
        sys.exit(1)

    try:
        llro_instance = LowestLatencyRoutesOptimizer(config)
    except ConfigError as exc:
        logging.error("Invalid configuration: %s", exc)
        sys.exit(1)

    if llro_instance.config.get("debug"):
        # getLogger() without a name is the root logger on every supported
        # Python; getLogger("root") only resolves to it from Python 3.9 on.
        logging.getLogger().setLevel(logging.DEBUG)

    llro_instance.run()


if __name__ == "__main__":
    main()
|
llro_cli.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
import socket
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
|
+
|
|
8
|
+
from llro import DEFAULT_ADMIN_SOCKET_PATH
|
|
9
|
+
|
|
10
|
+
DEFAULT_SOCKET_TIMEOUT_SECONDS = 5.0
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _send_request(
    socket_path: str, payload: Dict[str, Any], timeout_seconds: float = DEFAULT_SOCKET_TIMEOUT_SECONDS
) -> Dict[str, Any]:
    """
    Sends one JSON request over a Unix stream socket and returns the decoded reply.

    The payload is serialised as a single newline-terminated JSON line. The reply
    is read until the peer closes the connection and must decode to a JSON object.

    Parameters:
    socket_path (str): Filesystem path of the Unix socket to connect to.
    payload (Dict[str, Any]): Request body; must be JSON-serialisable.
    timeout_seconds (float): Timeout applied to connect, send and each receive.

    Returns:
    Dict[str, Any]: The decoded response object.

    Raises:
    RuntimeError: On connect/send/receive failure or timeout, on an empty reply,
        or when the reply is not valid UTF-8 JSON or not a JSON object.
    """
    message = (json.dumps(payload) + "\n").encode("utf-8")
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as client:
        client.settimeout(timeout_seconds)
        # socket.timeout is an OSError subclass, so it has to be caught first.
        try:
            client.connect(socket_path)
        except socket.timeout as exc:
            raise RuntimeError(
                "connection to %s timed out after %.1fs" % (socket_path, timeout_seconds)
            ) from exc
        except OSError as exc:
            raise RuntimeError("failed to connect to %s: %s" % (socket_path, exc)) from exc

        try:
            client.sendall(message)
        except socket.timeout as exc:
            raise RuntimeError(
                "write to %s timed out after %.1fs" % (socket_path, timeout_seconds)
            ) from exc
        except OSError as exc:
            raise RuntimeError("failed to send request to %s: %s" % (socket_path, exc)) from exc

        chunks = []
        while True:
            try:
                chunk = client.recv(4096)
            except socket.timeout as exc:
                raise RuntimeError(
                    "read from %s timed out after %.1fs" % (socket_path, timeout_seconds)
                ) from exc
            except OSError as exc:
                raise RuntimeError("failed to read response from %s: %s" % (socket_path, exc)) from exc
            if not chunk:
                # Empty read: the peer closed the connection, the reply is complete.
                break
            chunks.append(chunk)

    if not chunks:
        raise RuntimeError("empty response from daemon")

    try:
        # UnicodeDecodeError is a ValueError, so bad encoding is reported here too.
        response = json.loads(b"".join(chunks).decode("utf-8").strip())
    except ValueError as exc:
        raise RuntimeError("invalid response from daemon: %s" % exc) from exc

    if not isinstance(response, dict):
        raise RuntimeError(
            "invalid response from daemon: expected a JSON object, got %s" % type(response).__name__
        )
    return response
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _format_status_table(hosts: List[Dict[str, Any]]) -> str:
    """
    Renders the status entries as plain text.

    Each entry produces a header line with host, mode, switching flag, current
    route and override route (missing routes are shown as "-"), followed by one
    line per probed route in name order, or a placeholder when the entry has no
    probe data.

    Parameters:
    hosts (List[Dict[str, Any]]): Status entries as returned by the daemon.

    Returns:
    str: The lines joined with newlines; an empty string for an empty list.
    """
    output = []
    for entry in hosts:
        switching = "yes" if entry.get("switching_enabled") else "no"
        header_values = (
            entry.get("host"),
            entry.get("mode"),
            switching,
            entry.get("current_route") or "-",
            entry.get("override_route") or "-",
        )
        output.append("Host %s | mode=%s | switching=%s | current=%s | override=%s" % header_values)

        probe_routes = entry.get("routes") or {}
        if not probe_routes:
            output.append(" (no probe data yet)")
            continue

        for name in sorted(probe_routes):
            details = probe_routes[name]
            alive = "yes" if details.get("is_alive") else "no"
            output.append(
                " %s: rtt=%s ms, loss=%s%%, alive=%s"
                % (name, details.get("avg_rtt"), details.get("avg_loss"), alive)
            )
    return "\n".join(output)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _build_parser() -> argparse.ArgumentParser:
    """
    Creates the ``llro-cli`` argument parser.

    A sub-command is mandatory and stored as ``command``. All four sub-commands
    (``status``, ``override``, ``disable-switching``, ``reset-auto``) accept
    ``--socket``; the last two additionally require exactly one of ``--host``
    or ``--all``.

    Returns:
    argparse.ArgumentParser: The configured parser.
    """
    parser = argparse.ArgumentParser(
        prog="llro-cli",
        description="Admin client for LLRO daemon over Unix socket",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    def new_command(name, help_text):
        # Every sub-command starts with the same --socket option.
        sub = commands.add_parser(name, help=help_text)
        sub.add_argument("--socket", default=DEFAULT_ADMIN_SOCKET_PATH, help="admin socket path")
        return sub

    status_cmd = new_command("status", "show daemon routing status")
    status_cmd.add_argument("--json", action="store_true", dest="as_json", help="print raw JSON")

    override_cmd = new_command("override", "override route for a host")
    override_cmd.add_argument("--host", required=True, help="monitored host to control")
    override_cmd.add_argument("--route", required=True, help="route name to pin")

    disable_cmd = new_command("disable-switching", "disable route switching")
    disable_target = disable_cmd.add_mutually_exclusive_group(required=True)
    disable_target.add_argument("--host", help="disable switching for one monitored host")
    disable_target.add_argument("--all", action="store_true", help="disable switching for all monitored hosts")

    reset_cmd = new_command("reset-auto", "reset control mode back to auto routing")
    reset_target = reset_cmd.add_mutually_exclusive_group(required=True)
    reset_target.add_argument("--host", help="reset one monitored host")
    reset_target.add_argument("--all", action="store_true", help="reset all monitored hosts")

    return parser
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _make_payload(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Translates parsed command-line arguments into a request payload.

    Parameters:
    args (argparse.Namespace): Parsed arguments; ``command`` selects the action.
        ``override`` reads ``host`` and ``route``; ``disable-switching`` and
        ``reset-auto`` read ``all`` and, when it is false, ``host``.

    Returns:
    Dict[str, Any]: Payload with an "action" key plus the action's arguments.

    Raises:
    RuntimeError: If ``command`` is not one of the known sub-commands.
    """
    command = args.command

    if command == "status":
        return {"action": "status"}

    if command == "override":
        return {"action": "override", "host": args.host, "route": args.route}

    # Both remaining commands target either every host or a single one.
    for cli_name, action in (("disable-switching", "disable_switching"), ("reset-auto", "reset_auto")):
        if command != cli_name:
            continue
        request = {"action": action}  # type: Dict[str, Any]
        if args.all:
            request["all"] = True
        else:
            request["host"] = args.host
        return request

    raise RuntimeError("unknown command")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def main() -> None:
    """
    Command-line entry point of the admin client.

    Parses the arguments, sends the matching request to the socket given by
    ``--socket`` and prints the result. ``status`` prints a text table, or
    indented JSON with ``--json``; every other command prints the response data
    as compact JSON. Errors are written to stderr and end the process with
    status 1.

    Returns:
    None
    """
    args = _build_parser().parse_args()
    request = _make_payload(args)

    try:
        response = _send_request(args.socket, request)
    except RuntimeError as exc:
        print(str(exc), file=sys.stderr)
        sys.exit(1)

    if not response.get("ok"):
        print(response.get("error") or "request failed", file=sys.stderr)
        sys.exit(1)

    data = response.get("data") or {}
    if args.command != "status":
        print(json.dumps(data, sort_keys=True))
    elif args.as_json:
        print(json.dumps(data, indent=2, sort_keys=True))
    else:
        print(_format_status_table(data.get("hosts") or []))


if __name__ == "__main__":
    main()
|