flwr-nightly 1.13.0.dev20241028__py3-none-any.whl → 1.13.0.dev20241029__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/common/date.py +3 -3
- flwr/common/logger.py +31 -0
- flwr/common/serde.py +22 -0
- flwr/proto/driver_pb2.py +24 -23
- flwr/proto/driver_pb2_grpc.py +69 -0
- flwr/proto/driver_pb2_grpc.pyi +27 -0
- flwr/proto/log_pb2.py +29 -0
- flwr/proto/log_pb2.pyi +39 -0
- flwr/proto/log_pb2_grpc.py +4 -0
- flwr/proto/log_pb2_grpc.pyi +4 -0
- flwr/server/driver/driver.py +14 -0
- flwr/server/driver/grpc_driver.py +8 -15
- flwr/server/driver/inmemory_driver.py +3 -11
- flwr/server/run_serverapp.py +3 -4
- flwr/server/serverapp/app.py +119 -7
- flwr/server/superlink/driver/driver_servicer.py +27 -0
- flwr/server/superlink/linkstate/in_memory_linkstate.py +28 -2
- flwr/server/superlink/linkstate/linkstate.py +35 -0
- flwr/server/superlink/linkstate/sqlite_linkstate.py +50 -0
- flwr/simulation/run_simulation.py +2 -1
- flwr/superexec/deployment.py +3 -37
- flwr/superexec/exec_servicer.py +5 -72
- flwr/superexec/executor.py +3 -4
- flwr/superexec/simulation.py +4 -7
- {flwr_nightly-1.13.0.dev20241028.dist-info → flwr_nightly-1.13.0.dev20241029.dist-info}/METADATA +1 -1
- {flwr_nightly-1.13.0.dev20241028.dist-info → flwr_nightly-1.13.0.dev20241029.dist-info}/RECORD +29 -25
- {flwr_nightly-1.13.0.dev20241028.dist-info → flwr_nightly-1.13.0.dev20241029.dist-info}/LICENSE +0 -0
- {flwr_nightly-1.13.0.dev20241028.dist-info → flwr_nightly-1.13.0.dev20241029.dist-info}/WHEEL +0 -0
- {flwr_nightly-1.13.0.dev20241028.dist-info → flwr_nightly-1.13.0.dev20241029.dist-info}/entry_points.txt +0 -0
flwr/server/serverapp/app.py
CHANGED
|
@@ -16,13 +16,38 @@
|
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
18
|
import sys
|
|
19
|
-
from logging import DEBUG, INFO, WARN
|
|
19
|
+
from logging import DEBUG, ERROR, INFO, WARN
|
|
20
20
|
from os.path import isfile
|
|
21
21
|
from pathlib import Path
|
|
22
|
+
from time import sleep
|
|
22
23
|
from typing import Optional
|
|
23
24
|
|
|
25
|
+
from flwr.cli.config_utils import get_fab_metadata
|
|
26
|
+
from flwr.cli.install import install_from_fab
|
|
27
|
+
from flwr.common.config import (
|
|
28
|
+
get_flwr_dir,
|
|
29
|
+
get_fused_config_from_dir,
|
|
30
|
+
get_project_config,
|
|
31
|
+
get_project_dir,
|
|
32
|
+
)
|
|
33
|
+
from flwr.common.constant import Status, SubStatus
|
|
24
34
|
from flwr.common.logger import log
|
|
35
|
+
from flwr.common.serde import (
|
|
36
|
+
context_from_proto,
|
|
37
|
+
context_to_proto,
|
|
38
|
+
fab_from_proto,
|
|
39
|
+
run_from_proto,
|
|
40
|
+
run_status_to_proto,
|
|
41
|
+
)
|
|
42
|
+
from flwr.common.typing import RunStatus
|
|
43
|
+
from flwr.proto.driver_pb2 import ( # pylint: disable=E0611
|
|
44
|
+
PullServerAppInputsRequest,
|
|
45
|
+
PullServerAppInputsResponse,
|
|
46
|
+
PushServerAppOutputsRequest,
|
|
47
|
+
)
|
|
48
|
+
from flwr.proto.run_pb2 import UpdateRunStatusRequest # pylint: disable=E0611
|
|
25
49
|
from flwr.server.driver.grpc_driver import GrpcDriver
|
|
50
|
+
from flwr.server.run_serverapp import run as run_
|
|
26
51
|
|
|
27
52
|
|
|
28
53
|
def flwr_serverapp() -> None:
|
|
@@ -121,21 +146,108 @@ def _try_obtain_certificates(
|
|
|
121
146
|
return root_certificates
|
|
122
147
|
|
|
123
148
|
|
|
124
|
-
def run_serverapp( # pylint: disable=R0914
|
|
149
|
+
def run_serverapp(  # pylint: disable=R0914, disable=W0212
    superlink: str,
    run_id: Optional[int] = None,
    flwr_dir_: Optional[str] = None,
    certificates: Optional[bytes] = None,
) -> None:
    """Run Flower ServerApp process.

    Repeatedly pull the ServerApp inputs (run, FAB and context) through the
    driver's gRPC stub, install the FAB, run the ServerApp, push the resulting
    context back and report the final run status.

    Parameters
    ----------
    superlink : str
        Address handed to `GrpcDriver` as `driver_service_address`.
    run_id : Optional[int] (default: None)
        If not `None`, the loop stops after the first attempt that either
        processes a run or fails. If `None`, keep polling for runs.
    flwr_dir_ : Optional[str] (default: None)
        Passed to `get_flwr_dir` to resolve the directory where FABs are
        installed.
    certificates : Optional[bytes] (default: None)
        Handed to `GrpcDriver` as `root_certificates`.
    """
    driver = GrpcDriver(
        driver_service_address=superlink,
        root_certificates=certificates,
    )

    # Resolve directory where FABs are installed
    flwr_dir = get_flwr_dir(flwr_dir_)

    only_once = run_id is not None

    while True:
        # Reset the per-iteration state so that the `finally` clause never reads
        # an unbound name (first iteration) or the previous iteration's run.
        run = None
        run_status = None

        try:
            # Pull ServerAppInputs from LinkState
            req = (
                PullServerAppInputsRequest(run_id=run_id)
                if run_id
                else PullServerAppInputsRequest()
            )
            res: PullServerAppInputsResponse = driver._stub.PullServerAppInputs(req)
            if not res.HasField("run"):
                # Nothing to do yet: wait and poll again without reporting a status
                sleep(3)
                continue

            context = context_from_proto(res.context)
            run = run_from_proto(res.run)
            fab = fab_from_proto(res.fab)

            driver.init_run(run.run_id)

            log(DEBUG, "ServerApp process starts FAB installation.")
            install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)

            fab_id, fab_version = get_fab_metadata(fab.content)

            app_path = str(get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir))
            config = get_project_config(app_path)

            # Obtain server app reference and the run config
            server_app_attr = config["tool"]["flwr"]["app"]["components"]["serverapp"]
            server_app_run_config = get_fused_config_from_dir(
                Path(app_path), run.override_config
            )

            # Update run_config in context
            context.run_config = server_app_run_config

            log(
                DEBUG,
                "Flower will load ServerApp `%s` in %s",
                server_app_attr,
                app_path,
            )

            # Change status to Running
            run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
            driver._stub.UpdateRunStatus(
                UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
            )

            # Load and run the ServerApp with the Driver
            updated_context = run_(
                driver=driver,
                server_app_dir=app_path,
                server_app_attr=server_app_attr,
                context=context,
            )

            # Send resulting context
            context_proto = context_to_proto(updated_context)
            out_req = PushServerAppOutputsRequest(
                run_id=run.run_id, context=context_proto
            )
            _ = driver._stub.PushServerAppOutputs(out_req)

            run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")

        except Exception as ex:  # pylint: disable=broad-exception-caught
            exc_entity = "ServerApp"
            log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
            if run is None:
                # The failure happened before a run was obtained (e.g. the pull
                # request itself failed), so there is no run to mark as failed.
                # Back off like an idle poll instead of retrying in a tight loop.
                if not only_once:
                    sleep(3)
            else:
                run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))

        finally:
            # Only report a final status for a run obtained in this iteration
            if run is not None and run_status:
                run_status_proto = run_status_to_proto(run_status)
                driver._stub.UpdateRunStatus(
                    UpdateRunStatusRequest(
                        run_id=run.run_id, run_status=run_status_proto
                    )
                )

        # Stop the loop if `flwr-serverapp` is expected to process a single run
        if only_once:
            break

        # Reset the run_id
        run_id = None
|
|
@@ -31,6 +31,7 @@ from flwr.common.serde import (
|
|
|
31
31
|
context_to_proto,
|
|
32
32
|
fab_from_proto,
|
|
33
33
|
fab_to_proto,
|
|
34
|
+
run_status_from_proto,
|
|
34
35
|
run_to_proto,
|
|
35
36
|
user_config_from_proto,
|
|
36
37
|
)
|
|
@@ -49,12 +50,18 @@ from flwr.proto.driver_pb2 import ( # pylint: disable=E0611
|
|
|
49
50
|
PushTaskInsResponse,
|
|
50
51
|
)
|
|
51
52
|
from flwr.proto.fab_pb2 import GetFabRequest, GetFabResponse # pylint: disable=E0611
|
|
53
|
+
from flwr.proto.log_pb2 import ( # pylint: disable=E0611
|
|
54
|
+
PushLogsRequest,
|
|
55
|
+
PushLogsResponse,
|
|
56
|
+
)
|
|
52
57
|
from flwr.proto.node_pb2 import Node # pylint: disable=E0611
|
|
53
58
|
from flwr.proto.run_pb2 import ( # pylint: disable=E0611
|
|
54
59
|
CreateRunRequest,
|
|
55
60
|
CreateRunResponse,
|
|
56
61
|
GetRunRequest,
|
|
57
62
|
GetRunResponse,
|
|
63
|
+
UpdateRunStatusRequest,
|
|
64
|
+
UpdateRunStatusResponse,
|
|
58
65
|
)
|
|
59
66
|
from flwr.proto.task_pb2 import TaskRes # pylint: disable=E0611
|
|
60
67
|
from flwr.server.superlink.ffs.ffs import Ffs
|
|
@@ -253,6 +260,26 @@ class DriverServicer(driver_pb2_grpc.DriverServicer):
|
|
|
253
260
|
state.set_serverapp_context(request.run_id, context_from_proto(request.context))
|
|
254
261
|
return PushServerAppOutputsResponse()
|
|
255
262
|
|
|
263
|
+
def UpdateRunStatus(
    self, request: UpdateRunStatusRequest, context: grpc.ServicerContext
) -> UpdateRunStatusResponse:
    """Update the status of a run.

    Parameters
    ----------
    request : UpdateRunStatusRequest
        Carries the `run_id` and the new `run_status` (as a proto message).
    context : grpc.ServicerContext
        The gRPC call context (not used by this handler).

    Returns
    -------
    UpdateRunStatusResponse
        An empty response.
    """
    # Log under this servicer's own name, like the other handlers of the class
    log(DEBUG, "DriverServicer.UpdateRunStatus")
    state = self.state_factory.state()

    # Update the run status
    state.update_run_status(
        run_id=request.run_id, new_status=run_status_from_proto(request.run_status)
    )
    return UpdateRunStatusResponse()
|
|
275
|
+
|
|
276
|
+
def PushLogs(
    self, request: PushLogsRequest, context: grpc.ServicerContext
) -> PushLogsResponse:
    """Push logs.

    Placeholder handler: it records the call at DEBUG level and then always
    raises, so no logs are stored yet.

    Raises
    ------
    NotImplementedError
        Always.
    """
    log(DEBUG, "DriverServicer.PushLogs")
    # Not implemented yet
    raise NotImplementedError
|
|
282
|
+
|
|
256
283
|
|
|
257
284
|
def _raise_if(validation_error: bool, detail: str) -> None:
|
|
258
285
|
if validation_error:
|
|
@@ -17,7 +17,8 @@
|
|
|
17
17
|
|
|
18
18
|
import threading
|
|
19
19
|
import time
|
|
20
|
-
from
|
|
20
|
+
from bisect import bisect_right
|
|
21
|
+
from dataclasses import dataclass, field
|
|
21
22
|
from logging import ERROR, WARNING
|
|
22
23
|
from typing import Optional
|
|
23
24
|
from uuid import UUID, uuid4
|
|
@@ -43,7 +44,7 @@ from .utils import (
|
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
@dataclass
|
|
46
|
-
class RunRecord:
|
|
47
|
+
class RunRecord: # pylint: disable=R0902
|
|
47
48
|
"""The record of a specific run, including its status and timestamps."""
|
|
48
49
|
|
|
49
50
|
run: Run
|
|
@@ -52,6 +53,8 @@ class RunRecord:
|
|
|
52
53
|
starting_at: str = ""
|
|
53
54
|
running_at: str = ""
|
|
54
55
|
finished_at: str = ""
|
|
56
|
+
logs: list[tuple[float, str]] = field(default_factory=list)
|
|
57
|
+
log_lock: threading.Lock = field(default_factory=threading.Lock)
|
|
55
58
|
|
|
56
59
|
|
|
57
60
|
class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
@@ -511,3 +514,26 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
|
511
514
|
if run_id not in self.run_ids:
|
|
512
515
|
raise ValueError(f"Run {run_id} not found")
|
|
513
516
|
self.contexts[run_id] = context
|
|
517
|
+
|
|
518
|
+
def add_serverapp_log(self, run_id: int, log_message: str) -> None:
    """Append a timestamped entry to the serverapp logs of the given `run_id`.

    Raises
    ------
    ValueError
        If `run_id` is not a known run.
    """
    try:
        record = self.run_ids[run_id]
    except KeyError:
        raise ValueError(f"Run {run_id} not found") from None
    with record.log_lock:
        # Take the timestamp while holding the lock so that entries are appended
        # in timestamp order
        stamped_entry = (now().timestamp(), log_message)
        record.logs.append(stamped_entry)
|
|
525
|
+
|
|
526
|
+
def get_serverapp_log(
    self, run_id: int, after_timestamp: Optional[float]
) -> tuple[str, float]:
    """Get the serverapp logs for the specified `run_id`.

    Parameters
    ----------
    run_id : int
        The identifier of the run whose logs are requested.
    after_timestamp : Optional[float]
        Only entries with a timestamp strictly greater than this value are
        returned. `None` is treated as `0.0`.

    Returns
    -------
    tuple[str, float]
        The concatenated log messages, and the timestamp of the last stored
        entry, or `0.0` if no entries are returned.

    Raises
    ------
    ValueError
        If `run_id` is not a known run.
    """
    if run_id not in self.run_ids:
        raise ValueError(f"Run {run_id} not found")
    run = self.run_ids[run_id]
    if after_timestamp is None:
        after_timestamp = 0.0
    with run.log_lock:
        # Bisect on the timestamps alone. Bisecting on `(after_timestamp, "")`
        # compares whole tuples, so an entry stamped exactly `after_timestamp`
        # with a non-empty message sorts after the probe and would be returned
        # again on every poll.
        timestamps = [timestamp for timestamp, _ in run.logs]
        index = bisect_right(timestamps, after_timestamp)
        latest_timestamp = run.logs[-1][0] if index < len(run.logs) else 0.0
        return "".join(message for _, message in run.logs[index:]), latest_timestamp
|
|
@@ -299,3 +299,38 @@ class LinkState(abc.ABC): # pylint: disable=R0904
|
|
|
299
299
|
context : Context
|
|
300
300
|
The context to be associated with the specified `run_id`.
|
|
301
301
|
"""
|
|
302
|
+
|
|
303
|
+
@abc.abstractmethod
def add_serverapp_log(self, run_id: int, log_message: str) -> None:
    """Store one ServerApp log entry for the run identified by `run_id`.

    Parameters
    ----------
    run_id : int
        Identifier of the run the log entry belongs to.
    log_message : str
        Text of the log entry to append to that run's ServerApp logs.
    """
|
|
314
|
+
|
|
315
|
+
@abc.abstractmethod
def get_serverapp_log(
    self, run_id: int, after_timestamp: Optional[float]
) -> tuple[str, float]:
    """Return the ServerApp logs recorded for the run identified by `run_id`.

    Parameters
    ----------
    run_id : int
        Identifier of the run whose ServerApp logs are requested.

    after_timestamp : Optional[float]
        Only logs after this timestamp are retrieved. Pass `None` to retrieve
        all logs.

    Returns
    -------
    tuple[str, float]
        A pair made of:
        - The ServerApp logs of the specified `run_id`.
        - The timestamp of the latest log entry among the returned logs,
          or `0` when no logs are returned.
    """
|
|
@@ -99,6 +99,17 @@ CREATE TABLE IF NOT EXISTS run(
|
|
|
99
99
|
);
|
|
100
100
|
"""
|
|
101
101
|
|
|
102
|
+
SQL_CREATE_TABLE_LOGS = """
|
|
103
|
+
CREATE TABLE IF NOT EXISTS logs (
|
|
104
|
+
timestamp REAL,
|
|
105
|
+
run_id INTEGER,
|
|
106
|
+
node_id INTEGER,
|
|
107
|
+
log TEXT,
|
|
108
|
+
PRIMARY KEY (timestamp, run_id, node_id),
|
|
109
|
+
FOREIGN KEY (run_id) REFERENCES run(run_id)
|
|
110
|
+
);
|
|
111
|
+
"""
|
|
112
|
+
|
|
102
113
|
SQL_CREATE_TABLE_CONTEXT = """
|
|
103
114
|
CREATE TABLE IF NOT EXISTS context(
|
|
104
115
|
run_id INTEGER UNIQUE,
|
|
@@ -191,6 +202,7 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
|
191
202
|
|
|
192
203
|
# Create each table if not exists queries
|
|
193
204
|
cur.execute(SQL_CREATE_TABLE_RUN)
|
|
205
|
+
cur.execute(SQL_CREATE_TABLE_LOGS)
|
|
194
206
|
cur.execute(SQL_CREATE_TABLE_CONTEXT)
|
|
195
207
|
cur.execute(SQL_CREATE_TABLE_TASK_INS)
|
|
196
208
|
cur.execute(SQL_CREATE_TABLE_TASK_RES)
|
|
@@ -1015,6 +1027,44 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
|
1015
1027
|
except sqlite3.IntegrityError:
|
|
1016
1028
|
raise ValueError(f"Run {run_id} not found") from None
|
|
1017
1029
|
|
|
1030
|
+
def add_serverapp_log(self, run_id: int, log_message: str) -> None:
    """Insert one ServerApp log row for the specified `run_id`.

    Raises
    ------
    ValueError
        If the insert fails with `sqlite3.IntegrityError`.
    """
    # Convert the uint64 value to sint64 for SQLite
    sint64_run_id = convert_uint64_to_sint64(run_id)

    # Store log
    insert_sql = """
        INSERT INTO logs (timestamp, run_id, node_id, log) VALUES (?, ?, ?, ?);
    """
    # NOTE(review): node_id is always written as 0 here, presumably to mark
    # ServerApp logs; confirm against the readers of the `logs` table.
    row_values = (now().timestamp(), sint64_run_id, 0, log_message)
    try:
        self.query(insert_sql, row_values)
    except sqlite3.IntegrityError:
        raise ValueError(f"Run {run_id} not found") from None
|
|
1043
|
+
|
|
1044
|
+
def get_serverapp_log(
    self, run_id: int, after_timestamp: Optional[float]
) -> tuple[str, float]:
    """Fetch the ServerApp logs stored for the specified `run_id`.

    Only rows with `node_id = 0` and a timestamp greater than `after_timestamp`
    (`0.0` when `None`) are read. Returns the concatenated log text and the
    timestamp of the newest returned row, or `0.0` if no rows match.

    Raises
    ------
    ValueError
        If no row for `run_id` exists in the `run` table.
    """
    # Convert the uint64 value to sint64 for SQLite
    sint64_run_id = convert_uint64_to_sint64(run_id)

    # Check if the run_id exists
    run_rows = self.query("SELECT run_id FROM run WHERE run_id = ?;", (sint64_run_id,))
    if not run_rows:
        raise ValueError(f"Run {run_id} not found")

    # Retrieve logs
    since = 0.0 if after_timestamp is None else after_timestamp
    select_sql = """
        SELECT log, timestamp FROM logs
        WHERE run_id = ? AND node_id = ? AND timestamp > ?;
    """
    entries = self.query(select_sql, (sint64_run_id, 0, since))
    # The query has no ORDER BY, so order the rows by timestamp here
    entries.sort(key=lambda entry: entry["timestamp"])
    if entries:
        latest_timestamp = entries[-1]["timestamp"]
    else:
        latest_timestamp = 0.0
    combined = "".join(entry["log"] for entry in entries)
    return combined, latest_timestamp
|
|
1067
|
+
|
|
1018
1068
|
def get_valid_task_ins(self, task_id: str) -> Optional[dict[str, Any]]:
|
|
1019
1069
|
"""Check if the TaskIns exists and is valid (not expired).
|
|
1020
1070
|
|
|
@@ -421,7 +421,8 @@ def _main_loop(
|
|
|
421
421
|
server_app_run_config = {}
|
|
422
422
|
|
|
423
423
|
# Initialize Driver
|
|
424
|
-
driver = InMemoryDriver(
|
|
424
|
+
driver = InMemoryDriver(state_factory=state_factory)
|
|
425
|
+
driver.init_run(run_id=run.run_id)
|
|
425
426
|
|
|
426
427
|
# Get and run ServerApp thread
|
|
427
428
|
serverapp_th = run_serverapp_th(
|
flwr/superexec/deployment.py
CHANGED
|
@@ -15,14 +15,12 @@
|
|
|
15
15
|
"""Deployment engine executor."""
|
|
16
16
|
|
|
17
17
|
import hashlib
|
|
18
|
-
import subprocess
|
|
19
18
|
from logging import ERROR, INFO
|
|
20
19
|
from pathlib import Path
|
|
21
20
|
from typing import Optional
|
|
22
21
|
|
|
23
22
|
from typing_extensions import override
|
|
24
23
|
|
|
25
|
-
from flwr.cli.install import install_from_fab
|
|
26
24
|
from flwr.common.constant import DRIVER_API_DEFAULT_ADDRESS
|
|
27
25
|
from flwr.common.logger import log
|
|
28
26
|
from flwr.common.typing import Fab, UserConfig
|
|
@@ -30,7 +28,7 @@ from flwr.server.superlink.ffs import Ffs
|
|
|
30
28
|
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
31
29
|
from flwr.server.superlink.linkstate import LinkState, LinkStateFactory
|
|
32
30
|
|
|
33
|
-
from .executor import Executor
|
|
31
|
+
from .executor import Executor
|
|
34
32
|
|
|
35
33
|
|
|
36
34
|
class DeploymentEngine(Executor):
|
|
@@ -143,11 +141,9 @@ class DeploymentEngine(Executor):
|
|
|
143
141
|
fab_file: bytes,
|
|
144
142
|
override_config: UserConfig,
|
|
145
143
|
federation_config: UserConfig,
|
|
146
|
-
) -> Optional[
|
|
144
|
+
) -> Optional[int]:
|
|
147
145
|
"""Start run using the Flower Deployment Engine."""
|
|
148
146
|
try:
|
|
149
|
-
# Install FAB to flwr dir
|
|
150
|
-
install_from_fab(fab_file, None, True)
|
|
151
147
|
|
|
152
148
|
# Call SuperLink to create run
|
|
153
149
|
run_id: int = self._create_run(
|
|
@@ -155,37 +151,7 @@ class DeploymentEngine(Executor):
|
|
|
155
151
|
)
|
|
156
152
|
log(INFO, "Created run %s", str(run_id))
|
|
157
153
|
|
|
158
|
-
|
|
159
|
-
"flower-server-app",
|
|
160
|
-
"--run-id",
|
|
161
|
-
str(run_id),
|
|
162
|
-
"--superlink",
|
|
163
|
-
str(self.superlink),
|
|
164
|
-
]
|
|
165
|
-
|
|
166
|
-
if self.flwr_dir:
|
|
167
|
-
command.append("--flwr-dir")
|
|
168
|
-
command.append(self.flwr_dir)
|
|
169
|
-
|
|
170
|
-
if self.root_certificates is None:
|
|
171
|
-
command.append("--insecure")
|
|
172
|
-
else:
|
|
173
|
-
command.append("--root-certificates")
|
|
174
|
-
command.append(self.root_certificates)
|
|
175
|
-
|
|
176
|
-
# Execute the command
|
|
177
|
-
proc = subprocess.Popen( # pylint: disable=consider-using-with
|
|
178
|
-
command,
|
|
179
|
-
stdout=subprocess.PIPE,
|
|
180
|
-
stderr=subprocess.PIPE,
|
|
181
|
-
text=True,
|
|
182
|
-
)
|
|
183
|
-
log(INFO, "Started run %s", str(run_id))
|
|
184
|
-
|
|
185
|
-
return RunTracker(
|
|
186
|
-
run_id=run_id,
|
|
187
|
-
proc=proc,
|
|
188
|
-
)
|
|
154
|
+
return run_id
|
|
189
155
|
# pylint: disable-next=broad-except
|
|
190
156
|
except Exception as e:
|
|
191
157
|
log(ERROR, "Could not start run: %s", str(e))
|
flwr/superexec/exec_servicer.py
CHANGED
|
@@ -15,10 +15,6 @@
|
|
|
15
15
|
"""SuperExec API servicer."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
import select
|
|
19
|
-
import sys
|
|
20
|
-
import threading
|
|
21
|
-
import time
|
|
22
18
|
from collections.abc import Generator
|
|
23
19
|
from logging import ERROR, INFO
|
|
24
20
|
from typing import Any
|
|
@@ -37,7 +33,7 @@ from flwr.proto.exec_pb2 import ( # pylint: disable=E0611
|
|
|
37
33
|
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
38
34
|
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
39
35
|
|
|
40
|
-
from .executor import Executor
|
|
36
|
+
from .executor import Executor
|
|
41
37
|
|
|
42
38
|
SELECT_TIMEOUT = 1 # Timeout for selecting ready-to-read file descriptors (in seconds)
|
|
43
39
|
|
|
@@ -55,7 +51,6 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
55
51
|
self.ffs_factory = ffs_factory
|
|
56
52
|
self.executor = executor
|
|
57
53
|
self.executor.initialize(linkstate_factory, ffs_factory)
|
|
58
|
-
self.runs: dict[int, RunTracker] = {}
|
|
59
54
|
|
|
60
55
|
def StartRun(
|
|
61
56
|
self, request: StartRunRequest, context: grpc.ServicerContext
|
|
@@ -63,25 +58,17 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
63
58
|
"""Create run ID."""
|
|
64
59
|
log(INFO, "ExecServicer.StartRun")
|
|
65
60
|
|
|
66
|
-
|
|
61
|
+
run_id = self.executor.start_run(
|
|
67
62
|
request.fab.content,
|
|
68
63
|
user_config_from_proto(request.override_config),
|
|
69
64
|
user_config_from_proto(request.federation_config),
|
|
70
65
|
)
|
|
71
66
|
|
|
72
|
-
if
|
|
67
|
+
if run_id is None:
|
|
73
68
|
log(ERROR, "Executor failed to start run")
|
|
74
69
|
return StartRunResponse()
|
|
75
70
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
# Start a background thread to capture the log output
|
|
79
|
-
capture_thread = threading.Thread(
|
|
80
|
-
target=_capture_logs, args=(run,), daemon=True
|
|
81
|
-
)
|
|
82
|
-
capture_thread.start()
|
|
83
|
-
|
|
84
|
-
return StartRunResponse(run_id=run.run_id)
|
|
71
|
+
return StartRunResponse(run_id=run_id)
|
|
85
72
|
|
|
86
73
|
def StreamLogs( # pylint: disable=C0103
|
|
87
74
|
self, request: StreamLogsRequest, context: grpc.ServicerContext
|
|
@@ -89,58 +76,4 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
89
76
|
"""Get logs."""
|
|
90
77
|
log(INFO, "ExecServicer.StreamLogs")
|
|
91
78
|
|
|
92
|
-
|
|
93
|
-
if request.run_id not in self.runs:
|
|
94
|
-
context.abort(grpc.StatusCode.NOT_FOUND, "Run ID not found")
|
|
95
|
-
|
|
96
|
-
last_sent_index = 0
|
|
97
|
-
while context.is_active():
|
|
98
|
-
# Yield n'th row of logs, if n'th row < len(logs)
|
|
99
|
-
logs = self.runs[request.run_id].logs
|
|
100
|
-
for i in range(last_sent_index, len(logs)):
|
|
101
|
-
yield StreamLogsResponse(log_output=logs[i])
|
|
102
|
-
last_sent_index = len(logs)
|
|
103
|
-
|
|
104
|
-
# Wait for and continue to yield more log responses only if the
|
|
105
|
-
# run isn't completed yet. If the run is finished, the entire log
|
|
106
|
-
# is returned at this point and the server ends the stream.
|
|
107
|
-
if self.runs[request.run_id].proc.poll() is not None:
|
|
108
|
-
log(INFO, "All logs for run ID `%s` returned", request.run_id)
|
|
109
|
-
context.set_code(grpc.StatusCode.OK)
|
|
110
|
-
context.cancel()
|
|
111
|
-
|
|
112
|
-
time.sleep(1.0) # Sleep briefly to avoid busy waiting
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def _capture_logs(
|
|
116
|
-
run: RunTracker,
|
|
117
|
-
) -> None:
|
|
118
|
-
while True:
|
|
119
|
-
# Explicitly check if Popen.poll() is None. Required for `pytest`.
|
|
120
|
-
if run.proc.poll() is None:
|
|
121
|
-
# Select streams only when ready to read
|
|
122
|
-
ready_to_read, _, _ = select.select(
|
|
123
|
-
[run.proc.stdout, run.proc.stderr],
|
|
124
|
-
[],
|
|
125
|
-
[],
|
|
126
|
-
SELECT_TIMEOUT,
|
|
127
|
-
)
|
|
128
|
-
# Read from std* and append to RunTracker.logs
|
|
129
|
-
for stream in ready_to_read:
|
|
130
|
-
# Flush stdout to view output in real time
|
|
131
|
-
readline = stream.readline()
|
|
132
|
-
sys.stdout.write(readline)
|
|
133
|
-
sys.stdout.flush()
|
|
134
|
-
# Append to logs
|
|
135
|
-
line = readline.rstrip()
|
|
136
|
-
if line:
|
|
137
|
-
run.logs.append(f"{line}")
|
|
138
|
-
|
|
139
|
-
# Close std* to prevent blocking
|
|
140
|
-
elif run.proc.poll() is not None:
|
|
141
|
-
log(INFO, "Subprocess finished, exiting log capture")
|
|
142
|
-
if run.proc.stdout:
|
|
143
|
-
run.proc.stdout.close()
|
|
144
|
-
if run.proc.stderr:
|
|
145
|
-
run.proc.stderr.close()
|
|
146
|
-
break
|
|
79
|
+
yield StreamLogsResponse()
|
flwr/superexec/executor.py
CHANGED
|
@@ -72,7 +72,7 @@ class Executor(ABC):
|
|
|
72
72
|
fab_file: bytes,
|
|
73
73
|
override_config: UserConfig,
|
|
74
74
|
federation_config: UserConfig,
|
|
75
|
-
) -> Optional[
|
|
75
|
+
) -> Optional[int]:
|
|
76
76
|
"""Start a run using the given Flower FAB ID and version.
|
|
77
77
|
|
|
78
78
|
This method creates a new run on the SuperLink, returns its run_id
|
|
@@ -89,7 +89,6 @@ class Executor(ABC):
|
|
|
89
89
|
|
|
90
90
|
Returns
|
|
91
91
|
-------
|
|
92
|
-
run_id : Optional[
|
|
93
|
-
The run_id
|
|
94
|
-
or `None` if it fails.
|
|
92
|
+
run_id : Optional[int]
|
|
93
|
+
The run_id of the run created by the SuperLink, or `None` if it fails.
|
|
95
94
|
"""
|
flwr/superexec/simulation.py
CHANGED
|
@@ -33,7 +33,7 @@ from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
|
33
33
|
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
34
34
|
from flwr.server.superlink.linkstate.utils import generate_rand_int_from_bytes
|
|
35
35
|
|
|
36
|
-
from .executor import Executor
|
|
36
|
+
from .executor import Executor
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
def _user_config_to_str(user_config: UserConfig) -> str:
|
|
@@ -125,7 +125,7 @@ class SimulationEngine(Executor):
|
|
|
125
125
|
fab_file: bytes,
|
|
126
126
|
override_config: UserConfig,
|
|
127
127
|
federation_config: UserConfig,
|
|
128
|
-
) -> Optional[
|
|
128
|
+
) -> Optional[int]:
|
|
129
129
|
"""Start run using the Flower Simulation Engine."""
|
|
130
130
|
if self.num_supernodes is None:
|
|
131
131
|
raise ValueError(
|
|
@@ -199,17 +199,14 @@ class SimulationEngine(Executor):
|
|
|
199
199
|
command.extend(["--run-config", f"{override_config_str}"])
|
|
200
200
|
|
|
201
201
|
# Start Simulation
|
|
202
|
-
|
|
202
|
+
_ = subprocess.Popen( # pylint: disable=consider-using-with
|
|
203
203
|
command,
|
|
204
204
|
text=True,
|
|
205
205
|
)
|
|
206
206
|
|
|
207
207
|
log(INFO, "Started run %s", str(run_id))
|
|
208
208
|
|
|
209
|
-
return
|
|
210
|
-
run_id=run_id,
|
|
211
|
-
proc=proc,
|
|
212
|
-
)
|
|
209
|
+
return run_id
|
|
213
210
|
|
|
214
211
|
# pylint: disable-next=broad-except
|
|
215
212
|
except Exception as e:
|