flwr-nightly 1.13.0.dev20241021__py3-none-any.whl → 1.13.0.dev20241111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flwr-nightly might be problematic. Click here for more details.
- flwr/cli/build.py +2 -2
- flwr/cli/config_utils.py +97 -0
- flwr/cli/log.py +63 -97
- flwr/cli/new/templates/app/code/flwr_tune/dataset.py.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +1 -0
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +1 -1
- flwr/cli/run/run.py +34 -88
- flwr/client/app.py +23 -20
- flwr/client/clientapp/app.py +22 -18
- flwr/client/nodestate/__init__.py +25 -0
- flwr/client/nodestate/in_memory_nodestate.py +38 -0
- flwr/client/nodestate/nodestate.py +30 -0
- flwr/client/nodestate/nodestate_factory.py +37 -0
- flwr/client/{node_state.py → run_info_store.py} +4 -3
- flwr/client/supernode/app.py +6 -8
- flwr/common/args.py +83 -0
- flwr/common/config.py +10 -0
- flwr/common/constant.py +39 -5
- flwr/common/context.py +9 -4
- flwr/common/date.py +3 -3
- flwr/common/logger.py +108 -1
- flwr/common/object_ref.py +47 -16
- flwr/common/serde.py +24 -0
- flwr/common/telemetry.py +0 -6
- flwr/common/typing.py +10 -1
- flwr/proto/exec_pb2.py +14 -17
- flwr/proto/exec_pb2.pyi +14 -22
- flwr/proto/log_pb2.py +29 -0
- flwr/proto/log_pb2.pyi +39 -0
- flwr/proto/log_pb2_grpc.py +4 -0
- flwr/proto/log_pb2_grpc.pyi +4 -0
- flwr/proto/message_pb2.py +8 -8
- flwr/proto/message_pb2.pyi +4 -1
- flwr/proto/run_pb2.py +32 -27
- flwr/proto/run_pb2.pyi +26 -0
- flwr/proto/serverappio_pb2.py +52 -0
- flwr/proto/{driver_pb2.pyi → serverappio_pb2.pyi} +54 -0
- flwr/proto/serverappio_pb2_grpc.py +376 -0
- flwr/proto/serverappio_pb2_grpc.pyi +147 -0
- flwr/proto/simulationio_pb2.py +38 -0
- flwr/proto/simulationio_pb2.pyi +65 -0
- flwr/proto/simulationio_pb2_grpc.py +205 -0
- flwr/proto/simulationio_pb2_grpc.pyi +81 -0
- flwr/server/app.py +272 -105
- flwr/server/driver/driver.py +15 -1
- flwr/server/driver/grpc_driver.py +25 -36
- flwr/server/driver/inmemory_driver.py +6 -16
- flwr/server/run_serverapp.py +29 -23
- flwr/server/{superlink/state → serverapp}/__init__.py +3 -9
- flwr/server/serverapp/app.py +214 -0
- flwr/server/strategy/aggregate.py +4 -4
- flwr/server/strategy/fedadam.py +11 -1
- flwr/server/superlink/driver/__init__.py +1 -1
- flwr/server/superlink/driver/{driver_grpc.py → serverappio_grpc.py} +19 -16
- flwr/server/superlink/driver/{driver_servicer.py → serverappio_servicer.py} +125 -39
- flwr/server/superlink/fleet/grpc_adapter/grpc_adapter_servicer.py +4 -2
- flwr/server/superlink/fleet/grpc_bidi/grpc_server.py +2 -2
- flwr/server/superlink/fleet/grpc_rere/fleet_servicer.py +4 -2
- flwr/server/superlink/fleet/grpc_rere/server_interceptor.py +2 -2
- flwr/server/superlink/fleet/message_handler/message_handler.py +7 -7
- flwr/server/superlink/fleet/rest_rere/rest_api.py +7 -7
- flwr/server/superlink/fleet/vce/vce_api.py +23 -23
- flwr/server/superlink/linkstate/__init__.py +28 -0
- flwr/server/superlink/{state/in_memory_state.py → linkstate/in_memory_linkstate.py} +184 -36
- flwr/server/superlink/{state/state.py → linkstate/linkstate.py} +149 -19
- flwr/server/superlink/{state/state_factory.py → linkstate/linkstate_factory.py} +9 -9
- flwr/server/superlink/{state/sqlite_state.py → linkstate/sqlite_linkstate.py} +306 -65
- flwr/server/superlink/{state → linkstate}/utils.py +81 -30
- flwr/server/superlink/simulation/__init__.py +15 -0
- flwr/server/superlink/simulation/simulationio_grpc.py +65 -0
- flwr/server/superlink/simulation/simulationio_servicer.py +153 -0
- flwr/simulation/__init__.py +5 -1
- flwr/simulation/app.py +273 -345
- flwr/simulation/legacy_app.py +382 -0
- flwr/simulation/ray_transport/ray_client_proxy.py +2 -2
- flwr/simulation/run_simulation.py +57 -131
- flwr/simulation/simulationio_connection.py +86 -0
- flwr/superexec/app.py +6 -134
- flwr/superexec/deployment.py +61 -66
- flwr/superexec/exec_grpc.py +15 -8
- flwr/superexec/exec_servicer.py +36 -65
- flwr/superexec/executor.py +26 -7
- flwr/superexec/simulation.py +54 -107
- {flwr_nightly-1.13.0.dev20241021.dist-info → flwr_nightly-1.13.0.dev20241111.dist-info}/METADATA +5 -4
- {flwr_nightly-1.13.0.dev20241021.dist-info → flwr_nightly-1.13.0.dev20241111.dist-info}/RECORD +88 -69
- {flwr_nightly-1.13.0.dev20241021.dist-info → flwr_nightly-1.13.0.dev20241111.dist-info}/entry_points.txt +2 -0
- flwr/client/node_state_tests.py +0 -66
- flwr/proto/driver_pb2.py +0 -42
- flwr/proto/driver_pb2_grpc.py +0 -239
- flwr/proto/driver_pb2_grpc.pyi +0 -94
- {flwr_nightly-1.13.0.dev20241021.dist-info → flwr_nightly-1.13.0.dev20241111.dist-info}/LICENSE +0 -0
- {flwr_nightly-1.13.0.dev20241021.dist-info → flwr_nightly-1.13.0.dev20241111.dist-info}/WHEEL +0 -0
flwr/superexec/deployment.py
CHANGED
|
@@ -15,23 +15,21 @@
|
|
|
15
15
|
"""Deployment engine executor."""
|
|
16
16
|
|
|
17
17
|
import hashlib
|
|
18
|
-
import subprocess
|
|
19
18
|
from logging import ERROR, INFO
|
|
20
19
|
from pathlib import Path
|
|
21
20
|
from typing import Optional
|
|
22
21
|
|
|
23
22
|
from typing_extensions import override
|
|
24
23
|
|
|
25
|
-
from flwr.
|
|
26
|
-
from flwr.common.constant import
|
|
27
|
-
from flwr.common.grpc import create_channel
|
|
24
|
+
from flwr.common import ConfigsRecord, Context, RecordSet
|
|
25
|
+
from flwr.common.constant import SERVERAPPIO_API_DEFAULT_ADDRESS, Status, SubStatus
|
|
28
26
|
from flwr.common.logger import log
|
|
29
|
-
from flwr.common.
|
|
30
|
-
from flwr.
|
|
31
|
-
from flwr.
|
|
32
|
-
from flwr.
|
|
27
|
+
from flwr.common.typing import Fab, RunStatus, UserConfig
|
|
28
|
+
from flwr.server.superlink.ffs import Ffs
|
|
29
|
+
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
30
|
+
from flwr.server.superlink.linkstate import LinkState, LinkStateFactory
|
|
33
31
|
|
|
34
|
-
from .executor import Executor
|
|
32
|
+
from .executor import Executor
|
|
35
33
|
|
|
36
34
|
|
|
37
35
|
class DeploymentEngine(Executor):
|
|
@@ -50,7 +48,7 @@ class DeploymentEngine(Executor):
|
|
|
50
48
|
|
|
51
49
|
def __init__(
|
|
52
50
|
self,
|
|
53
|
-
superlink: str =
|
|
51
|
+
superlink: str = SERVERAPPIO_API_DEFAULT_ADDRESS,
|
|
54
52
|
root_certificates: Optional[str] = None,
|
|
55
53
|
flwr_dir: Optional[str] = None,
|
|
56
54
|
) -> None:
|
|
@@ -62,7 +60,30 @@ class DeploymentEngine(Executor):
|
|
|
62
60
|
self.root_certificates = root_certificates
|
|
63
61
|
self.root_certificates_bytes = Path(root_certificates).read_bytes()
|
|
64
62
|
self.flwr_dir = flwr_dir
|
|
65
|
-
self.
|
|
63
|
+
self.linkstate_factory: Optional[LinkStateFactory] = None
|
|
64
|
+
self.ffs_factory: Optional[FfsFactory] = None
|
|
65
|
+
|
|
66
|
+
@override
|
|
67
|
+
def initialize(
|
|
68
|
+
self, linkstate_factory: LinkStateFactory, ffs_factory: FfsFactory
|
|
69
|
+
) -> None:
|
|
70
|
+
"""Initialize the executor with the necessary factories."""
|
|
71
|
+
self.linkstate_factory = linkstate_factory
|
|
72
|
+
self.ffs_factory = ffs_factory
|
|
73
|
+
|
|
74
|
+
@property
|
|
75
|
+
def linkstate(self) -> LinkState:
|
|
76
|
+
"""Return the LinkState."""
|
|
77
|
+
if self.linkstate_factory is None:
|
|
78
|
+
raise RuntimeError("Executor is not initialized.")
|
|
79
|
+
return self.linkstate_factory.state()
|
|
80
|
+
|
|
81
|
+
@property
|
|
82
|
+
def ffs(self) -> Ffs:
|
|
83
|
+
"""Return the Flower File Storage (FFS)."""
|
|
84
|
+
if self.ffs_factory is None:
|
|
85
|
+
raise RuntimeError("Executor is not initialized.")
|
|
86
|
+
return self.ffs_factory.ffs()
|
|
66
87
|
|
|
67
88
|
@override
|
|
68
89
|
def set_config(
|
|
@@ -77,7 +98,7 @@ class DeploymentEngine(Executor):
|
|
|
77
98
|
A dictionary for configuration values.
|
|
78
99
|
Supported configuration key/value pairs:
|
|
79
100
|
- "superlink": str
|
|
80
|
-
The address of the SuperLink
|
|
101
|
+
The address of the SuperLink ServerAppIo API.
|
|
81
102
|
- "root-certificates": str
|
|
82
103
|
The path to the root certificates.
|
|
83
104
|
- "flwr-dir": str
|
|
@@ -101,85 +122,59 @@ class DeploymentEngine(Executor):
|
|
|
101
122
|
raise ValueError("The `flwr-dir` value should be of type `str`.")
|
|
102
123
|
self.flwr_dir = str(flwr_dir)
|
|
103
124
|
|
|
104
|
-
def _connect(self) -> None:
|
|
105
|
-
if self.stub is not None:
|
|
106
|
-
return
|
|
107
|
-
channel = create_channel(
|
|
108
|
-
server_address=self.superlink,
|
|
109
|
-
insecure=(self.root_certificates_bytes is None),
|
|
110
|
-
root_certificates=self.root_certificates_bytes,
|
|
111
|
-
)
|
|
112
|
-
self.stub = DriverStub(channel)
|
|
113
|
-
|
|
114
125
|
def _create_run(
|
|
115
126
|
self,
|
|
116
127
|
fab: Fab,
|
|
117
128
|
override_config: UserConfig,
|
|
118
129
|
) -> int:
|
|
119
|
-
|
|
120
|
-
|
|
130
|
+
fab_hash = self.ffs.put(fab.content, {})
|
|
131
|
+
if fab_hash != fab.hash_str:
|
|
132
|
+
raise RuntimeError(
|
|
133
|
+
f"FAB ({fab.hash_str}) hash from request doesn't match contents"
|
|
134
|
+
)
|
|
121
135
|
|
|
122
|
-
|
|
136
|
+
run_id = self.linkstate.create_run(
|
|
137
|
+
None, None, fab_hash, override_config, ConfigsRecord()
|
|
138
|
+
)
|
|
139
|
+
return run_id
|
|
123
140
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
141
|
+
def _create_context(self, run_id: int) -> None:
|
|
142
|
+
"""Register a Context for a Run."""
|
|
143
|
+
# Create an empty context for the Run
|
|
144
|
+
context = Context(
|
|
145
|
+
run_id=run_id, node_id=0, node_config={}, state=RecordSet(), run_config={}
|
|
127
146
|
)
|
|
128
|
-
|
|
129
|
-
|
|
147
|
+
|
|
148
|
+
# Register the context at the LinkState
|
|
149
|
+
self.linkstate.set_serverapp_context(run_id=run_id, context=context)
|
|
130
150
|
|
|
131
151
|
@override
|
|
132
152
|
def start_run(
|
|
133
153
|
self,
|
|
134
154
|
fab_file: bytes,
|
|
135
155
|
override_config: UserConfig,
|
|
136
|
-
|
|
137
|
-
) -> Optional[
|
|
156
|
+
federation_options: ConfigsRecord,
|
|
157
|
+
) -> Optional[int]:
|
|
138
158
|
"""Start run using the Flower Deployment Engine."""
|
|
159
|
+
run_id = None
|
|
139
160
|
try:
|
|
140
|
-
# Install FAB to flwr dir
|
|
141
|
-
install_from_fab(fab_file, None, True)
|
|
142
161
|
|
|
143
162
|
# Call SuperLink to create run
|
|
144
|
-
run_id
|
|
163
|
+
run_id = self._create_run(
|
|
145
164
|
Fab(hashlib.sha256(fab_file).hexdigest(), fab_file), override_config
|
|
146
165
|
)
|
|
147
|
-
log(INFO, "Created run %s", str(run_id))
|
|
148
166
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
str(run_id),
|
|
153
|
-
"--superlink",
|
|
154
|
-
str(self.superlink),
|
|
155
|
-
]
|
|
156
|
-
|
|
157
|
-
if self.flwr_dir:
|
|
158
|
-
command.append("--flwr-dir")
|
|
159
|
-
command.append(self.flwr_dir)
|
|
160
|
-
|
|
161
|
-
if self.root_certificates is None:
|
|
162
|
-
command.append("--insecure")
|
|
163
|
-
else:
|
|
164
|
-
command.append("--root-certificates")
|
|
165
|
-
command.append(self.root_certificates)
|
|
166
|
-
|
|
167
|
-
# Execute the command
|
|
168
|
-
proc = subprocess.Popen( # pylint: disable=consider-using-with
|
|
169
|
-
command,
|
|
170
|
-
stdout=subprocess.PIPE,
|
|
171
|
-
stderr=subprocess.PIPE,
|
|
172
|
-
text=True,
|
|
173
|
-
)
|
|
174
|
-
log(INFO, "Started run %s", str(run_id))
|
|
167
|
+
# Register context for the Run
|
|
168
|
+
self._create_context(run_id=run_id)
|
|
169
|
+
log(INFO, "Created run %s", str(run_id))
|
|
175
170
|
|
|
176
|
-
return
|
|
177
|
-
run_id=run_id,
|
|
178
|
-
proc=proc,
|
|
179
|
-
)
|
|
171
|
+
return run_id
|
|
180
172
|
# pylint: disable-next=broad-except
|
|
181
173
|
except Exception as e:
|
|
182
174
|
log(ERROR, "Could not start run: %s", str(e))
|
|
175
|
+
if run_id:
|
|
176
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(e))
|
|
177
|
+
self.linkstate.update_run_status(run_id, new_status=run_status)
|
|
183
178
|
return None
|
|
184
179
|
|
|
185
180
|
|
flwr/superexec/exec_grpc.py
CHANGED
|
@@ -23,33 +23,40 @@ from flwr.common import GRPC_MAX_MESSAGE_LENGTH
|
|
|
23
23
|
from flwr.common.logger import log
|
|
24
24
|
from flwr.common.typing import UserConfig
|
|
25
25
|
from flwr.proto.exec_pb2_grpc import add_ExecServicer_to_server
|
|
26
|
+
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
26
27
|
from flwr.server.superlink.fleet.grpc_bidi.grpc_server import generic_create_grpc_server
|
|
28
|
+
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
27
29
|
|
|
28
30
|
from .exec_servicer import ExecServicer
|
|
29
31
|
from .executor import Executor
|
|
30
32
|
|
|
31
33
|
|
|
32
|
-
|
|
34
|
+
# pylint: disable-next=too-many-arguments, too-many-positional-arguments
|
|
35
|
+
def run_exec_api_grpc(
|
|
33
36
|
address: str,
|
|
34
37
|
executor: Executor,
|
|
38
|
+
state_factory: LinkStateFactory,
|
|
39
|
+
ffs_factory: FfsFactory,
|
|
35
40
|
certificates: Optional[tuple[bytes, bytes, bytes]],
|
|
36
41
|
config: UserConfig,
|
|
37
42
|
) -> grpc.Server:
|
|
38
|
-
"""Run
|
|
43
|
+
"""Run Exec API (gRPC, request-response)."""
|
|
39
44
|
executor.set_config(config)
|
|
40
45
|
|
|
41
46
|
exec_servicer: grpc.Server = ExecServicer(
|
|
47
|
+
linkstate_factory=state_factory,
|
|
48
|
+
ffs_factory=ffs_factory,
|
|
42
49
|
executor=executor,
|
|
43
50
|
)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
servicer_and_add_fn=(exec_servicer,
|
|
51
|
+
exec_add_servicer_to_server_fn = add_ExecServicer_to_server
|
|
52
|
+
exec_grpc_server = generic_create_grpc_server(
|
|
53
|
+
servicer_and_add_fn=(exec_servicer, exec_add_servicer_to_server_fn),
|
|
47
54
|
server_address=address,
|
|
48
55
|
max_message_length=GRPC_MAX_MESSAGE_LENGTH,
|
|
49
56
|
certificates=certificates,
|
|
50
57
|
)
|
|
51
58
|
|
|
52
|
-
log(INFO, "
|
|
53
|
-
|
|
59
|
+
log(INFO, "Flower Deployment Engine: Starting Exec API on %s", address)
|
|
60
|
+
exec_grpc_server.start()
|
|
54
61
|
|
|
55
|
-
return
|
|
62
|
+
return exec_grpc_server
|
flwr/superexec/exec_servicer.py
CHANGED
|
@@ -15,9 +15,6 @@
|
|
|
15
15
|
"""SuperExec API servicer."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
import select
|
|
19
|
-
import sys
|
|
20
|
-
import threading
|
|
21
18
|
import time
|
|
22
19
|
from collections.abc import Generator
|
|
23
20
|
from logging import ERROR, INFO
|
|
@@ -25,8 +22,9 @@ from typing import Any
|
|
|
25
22
|
|
|
26
23
|
import grpc
|
|
27
24
|
|
|
25
|
+
from flwr.common.constant import LOG_STREAM_INTERVAL, Status
|
|
28
26
|
from flwr.common.logger import log
|
|
29
|
-
from flwr.common.serde import user_config_from_proto
|
|
27
|
+
from flwr.common.serde import configs_record_from_proto, user_config_from_proto
|
|
30
28
|
from flwr.proto import exec_pb2_grpc # pylint: disable=E0611
|
|
31
29
|
from flwr.proto.exec_pb2 import ( # pylint: disable=E0611
|
|
32
30
|
StartRunRequest,
|
|
@@ -34,18 +32,25 @@ from flwr.proto.exec_pb2 import ( # pylint: disable=E0611
|
|
|
34
32
|
StreamLogsRequest,
|
|
35
33
|
StreamLogsResponse,
|
|
36
34
|
)
|
|
35
|
+
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
36
|
+
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
37
37
|
|
|
38
|
-
from .executor import Executor
|
|
39
|
-
|
|
40
|
-
SELECT_TIMEOUT = 1 # Timeout for selecting ready-to-read file descriptors (in seconds)
|
|
38
|
+
from .executor import Executor
|
|
41
39
|
|
|
42
40
|
|
|
43
41
|
class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
44
42
|
"""SuperExec API servicer."""
|
|
45
43
|
|
|
46
|
-
def __init__(
|
|
44
|
+
def __init__(
|
|
45
|
+
self,
|
|
46
|
+
linkstate_factory: LinkStateFactory,
|
|
47
|
+
ffs_factory: FfsFactory,
|
|
48
|
+
executor: Executor,
|
|
49
|
+
) -> None:
|
|
50
|
+
self.linkstate_factory = linkstate_factory
|
|
51
|
+
self.ffs_factory = ffs_factory
|
|
47
52
|
self.executor = executor
|
|
48
|
-
self.
|
|
53
|
+
self.executor.initialize(linkstate_factory, ffs_factory)
|
|
49
54
|
|
|
50
55
|
def StartRun(
|
|
51
56
|
self, request: StartRunRequest, context: grpc.ServicerContext
|
|
@@ -53,84 +58,50 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
53
58
|
"""Create run ID."""
|
|
54
59
|
log(INFO, "ExecServicer.StartRun")
|
|
55
60
|
|
|
56
|
-
|
|
61
|
+
run_id = self.executor.start_run(
|
|
57
62
|
request.fab.content,
|
|
58
63
|
user_config_from_proto(request.override_config),
|
|
59
|
-
|
|
64
|
+
configs_record_from_proto(request.federation_options),
|
|
60
65
|
)
|
|
61
66
|
|
|
62
|
-
if
|
|
67
|
+
if run_id is None:
|
|
63
68
|
log(ERROR, "Executor failed to start run")
|
|
64
69
|
return StartRunResponse()
|
|
65
70
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
# Start a background thread to capture the log output
|
|
69
|
-
capture_thread = threading.Thread(
|
|
70
|
-
target=_capture_logs, args=(run,), daemon=True
|
|
71
|
-
)
|
|
72
|
-
capture_thread.start()
|
|
73
|
-
|
|
74
|
-
return StartRunResponse(run_id=run.run_id)
|
|
71
|
+
return StartRunResponse(run_id=run_id)
|
|
75
72
|
|
|
76
73
|
def StreamLogs( # pylint: disable=C0103
|
|
77
74
|
self, request: StreamLogsRequest, context: grpc.ServicerContext
|
|
78
75
|
) -> Generator[StreamLogsResponse, Any, None]:
|
|
79
76
|
"""Get logs."""
|
|
80
77
|
log(INFO, "ExecServicer.StreamLogs")
|
|
78
|
+
state = self.linkstate_factory.state()
|
|
79
|
+
|
|
80
|
+
# Retrieve run ID
|
|
81
|
+
run_id = request.run_id
|
|
81
82
|
|
|
82
83
|
# Exit if `run_id` not found
|
|
83
|
-
if
|
|
84
|
+
if not state.get_run(run_id):
|
|
84
85
|
context.abort(grpc.StatusCode.NOT_FOUND, "Run ID not found")
|
|
85
86
|
|
|
86
|
-
|
|
87
|
+
after_timestamp = request.after_timestamp + 1e-6
|
|
87
88
|
while context.is_active():
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
89
|
+
log_msg, latest_timestamp = state.get_serverapp_log(run_id, after_timestamp)
|
|
90
|
+
if log_msg:
|
|
91
|
+
yield StreamLogsResponse(
|
|
92
|
+
log_output=log_msg,
|
|
93
|
+
latest_timestamp=latest_timestamp,
|
|
94
|
+
)
|
|
95
|
+
# Add a small epsilon to the latest timestamp to avoid getting
|
|
96
|
+
# the same log
|
|
97
|
+
after_timestamp = max(latest_timestamp + 1e-6, after_timestamp)
|
|
93
98
|
|
|
94
99
|
# Wait for and continue to yield more log responses only if the
|
|
95
100
|
# run isn't completed yet. If the run is finished, the entire log
|
|
96
101
|
# is returned at this point and the server ends the stream.
|
|
97
|
-
|
|
102
|
+
run_status = state.get_run_status({run_id})[run_id]
|
|
103
|
+
if run_status.status == Status.FINISHED:
|
|
98
104
|
log(INFO, "All logs for run ID `%s` returned", request.run_id)
|
|
99
|
-
context.set_code(grpc.StatusCode.OK)
|
|
100
105
|
context.cancel()
|
|
101
106
|
|
|
102
|
-
time.sleep(
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def _capture_logs(
|
|
106
|
-
run: RunTracker,
|
|
107
|
-
) -> None:
|
|
108
|
-
while True:
|
|
109
|
-
# Explicitly check if Popen.poll() is None. Required for `pytest`.
|
|
110
|
-
if run.proc.poll() is None:
|
|
111
|
-
# Select streams only when ready to read
|
|
112
|
-
ready_to_read, _, _ = select.select(
|
|
113
|
-
[run.proc.stdout, run.proc.stderr],
|
|
114
|
-
[],
|
|
115
|
-
[],
|
|
116
|
-
SELECT_TIMEOUT,
|
|
117
|
-
)
|
|
118
|
-
# Read from std* and append to RunTracker.logs
|
|
119
|
-
for stream in ready_to_read:
|
|
120
|
-
# Flush stdout to view output in real time
|
|
121
|
-
readline = stream.readline()
|
|
122
|
-
sys.stdout.write(readline)
|
|
123
|
-
sys.stdout.flush()
|
|
124
|
-
# Append to logs
|
|
125
|
-
line = readline.rstrip()
|
|
126
|
-
if line:
|
|
127
|
-
run.logs.append(f"{line}")
|
|
128
|
-
|
|
129
|
-
# Close std* to prevent blocking
|
|
130
|
-
elif run.proc.poll() is not None:
|
|
131
|
-
log(INFO, "Subprocess finished, exiting log capture")
|
|
132
|
-
if run.proc.stdout:
|
|
133
|
-
run.proc.stdout.close()
|
|
134
|
-
if run.proc.stderr:
|
|
135
|
-
run.proc.stderr.close()
|
|
136
|
-
break
|
|
107
|
+
time.sleep(LOG_STREAM_INTERVAL) # Sleep briefly to avoid busy waiting
|
flwr/superexec/executor.py
CHANGED
|
@@ -19,7 +19,10 @@ from dataclasses import dataclass, field
|
|
|
19
19
|
from subprocess import Popen
|
|
20
20
|
from typing import Optional
|
|
21
21
|
|
|
22
|
+
from flwr.common import ConfigsRecord
|
|
22
23
|
from flwr.common.typing import UserConfig
|
|
24
|
+
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
25
|
+
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
23
26
|
|
|
24
27
|
|
|
25
28
|
@dataclass
|
|
@@ -34,6 +37,23 @@ class RunTracker:
|
|
|
34
37
|
class Executor(ABC):
|
|
35
38
|
"""Execute and monitor a Flower run."""
|
|
36
39
|
|
|
40
|
+
@abstractmethod
|
|
41
|
+
def initialize(
|
|
42
|
+
self, linkstate_factory: LinkStateFactory, ffs_factory: FfsFactory
|
|
43
|
+
) -> None:
|
|
44
|
+
"""Initialize the executor with the necessary factories.
|
|
45
|
+
|
|
46
|
+
This method sets up the executor by providing it with the factories required
|
|
47
|
+
to access the LinkState and the Flower File Storage (FFS) in the SuperLink.
|
|
48
|
+
|
|
49
|
+
Parameters
|
|
50
|
+
----------
|
|
51
|
+
linkstate_factory : LinkStateFactory
|
|
52
|
+
The factory to create access to the LinkState.
|
|
53
|
+
ffs_factory : FfsFactory
|
|
54
|
+
The factory to create access to the Flower File Storage (FFS).
|
|
55
|
+
"""
|
|
56
|
+
|
|
37
57
|
@abstractmethod
|
|
38
58
|
def set_config(
|
|
39
59
|
self,
|
|
@@ -52,8 +72,8 @@ class Executor(ABC):
|
|
|
52
72
|
self,
|
|
53
73
|
fab_file: bytes,
|
|
54
74
|
override_config: UserConfig,
|
|
55
|
-
|
|
56
|
-
) -> Optional[
|
|
75
|
+
federation_options: ConfigsRecord,
|
|
76
|
+
) -> Optional[int]:
|
|
57
77
|
"""Start a run using the given Flower FAB ID and version.
|
|
58
78
|
|
|
59
79
|
This method creates a new run on the SuperLink, returns its run_id
|
|
@@ -65,12 +85,11 @@ class Executor(ABC):
|
|
|
65
85
|
The Flower App Bundle file bytes.
|
|
66
86
|
override_config: UserConfig
|
|
67
87
|
The config overrides dict sent by the user (using `flwr run`).
|
|
68
|
-
|
|
69
|
-
The federation options
|
|
88
|
+
federation_options: ConfigsRecord
|
|
89
|
+
The federation options sent by the user (using `flwr run`).
|
|
70
90
|
|
|
71
91
|
Returns
|
|
72
92
|
-------
|
|
73
|
-
run_id : Optional[
|
|
74
|
-
The run_id
|
|
75
|
-
or `None` if it fails.
|
|
93
|
+
run_id : Optional[int]
|
|
94
|
+
The run_id of the run created by the SuperLink, or `None` if it fails.
|
|
76
95
|
"""
|
flwr/superexec/simulation.py
CHANGED
|
@@ -15,42 +15,20 @@
|
|
|
15
15
|
"""Simulation engine executor."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
import
|
|
19
|
-
import
|
|
20
|
-
import sys
|
|
21
|
-
from logging import ERROR, INFO, WARN
|
|
18
|
+
import hashlib
|
|
19
|
+
from logging import ERROR, INFO
|
|
22
20
|
from typing import Optional
|
|
23
21
|
|
|
24
22
|
from typing_extensions import override
|
|
25
23
|
|
|
26
|
-
from flwr.
|
|
27
|
-
from flwr.cli.install import install_from_fab
|
|
28
|
-
from flwr.common.config import unflatten_dict
|
|
29
|
-
from flwr.common.constant import RUN_ID_NUM_BYTES
|
|
24
|
+
from flwr.common import ConfigsRecord, Context, RecordSet
|
|
30
25
|
from flwr.common.logger import log
|
|
31
|
-
from flwr.common.typing import UserConfig
|
|
32
|
-
from flwr.server.superlink.
|
|
33
|
-
|
|
34
|
-
from .
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def _user_config_to_str(user_config: UserConfig) -> str:
|
|
38
|
-
"""Convert override user config to string."""
|
|
39
|
-
user_config_list_str = []
|
|
40
|
-
for key, value in user_config.items():
|
|
41
|
-
if isinstance(value, bool):
|
|
42
|
-
user_config_list_str.append(f"{key}={str(value).lower()}")
|
|
43
|
-
elif isinstance(value, (int, float)):
|
|
44
|
-
user_config_list_str.append(f"{key}={value}")
|
|
45
|
-
elif isinstance(value, str):
|
|
46
|
-
user_config_list_str.append(f'{key}="{value}"')
|
|
47
|
-
else:
|
|
48
|
-
raise ValueError(
|
|
49
|
-
"Only types `bool`, `float`, `int` and `str` are supported"
|
|
50
|
-
)
|
|
26
|
+
from flwr.common.typing import Fab, UserConfig
|
|
27
|
+
from flwr.server.superlink.ffs import Ffs
|
|
28
|
+
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
29
|
+
from flwr.server.superlink.linkstate import LinkState, LinkStateFactory
|
|
51
30
|
|
|
52
|
-
|
|
53
|
-
return user_config_str
|
|
31
|
+
from .executor import Executor
|
|
54
32
|
|
|
55
33
|
|
|
56
34
|
class SimulationEngine(Executor):
|
|
@@ -69,6 +47,30 @@ class SimulationEngine(Executor):
|
|
|
69
47
|
) -> None:
|
|
70
48
|
self.num_supernodes = num_supernodes
|
|
71
49
|
self.verbose = verbose
|
|
50
|
+
self.linkstate_factory: Optional[LinkStateFactory] = None
|
|
51
|
+
self.ffs_factory: Optional[FfsFactory] = None
|
|
52
|
+
|
|
53
|
+
@override
|
|
54
|
+
def initialize(
|
|
55
|
+
self, linkstate_factory: LinkStateFactory, ffs_factory: FfsFactory
|
|
56
|
+
) -> None:
|
|
57
|
+
"""Initialize the executor with the necessary factories."""
|
|
58
|
+
self.linkstate_factory = linkstate_factory
|
|
59
|
+
self.ffs_factory = ffs_factory
|
|
60
|
+
|
|
61
|
+
@property
|
|
62
|
+
def linkstate(self) -> LinkState:
|
|
63
|
+
"""Return the LinkState."""
|
|
64
|
+
if self.linkstate_factory is None:
|
|
65
|
+
raise RuntimeError("Executor is not initialized.")
|
|
66
|
+
return self.linkstate_factory.state()
|
|
67
|
+
|
|
68
|
+
@property
|
|
69
|
+
def ffs(self) -> Ffs:
|
|
70
|
+
"""Return the Flower File Storage (FFS)."""
|
|
71
|
+
if self.ffs_factory is None:
|
|
72
|
+
raise RuntimeError("Executor is not initialized.")
|
|
73
|
+
return self.ffs_factory.ffs()
|
|
72
74
|
|
|
73
75
|
@override
|
|
74
76
|
def set_config(
|
|
@@ -116,92 +118,37 @@ class SimulationEngine(Executor):
|
|
|
116
118
|
self,
|
|
117
119
|
fab_file: bytes,
|
|
118
120
|
override_config: UserConfig,
|
|
119
|
-
|
|
120
|
-
) -> Optional[
|
|
121
|
+
federation_options: ConfigsRecord,
|
|
122
|
+
) -> Optional[int]:
|
|
121
123
|
"""Start run using the Flower Simulation Engine."""
|
|
122
|
-
if self.num_supernodes is None:
|
|
123
|
-
raise ValueError(
|
|
124
|
-
"Error in `SuperExec` (`SimulationEngine` executor):\n\n"
|
|
125
|
-
"`num-supernodes` must not be `None`, it must be a valid "
|
|
126
|
-
"positive integer. In order to start this simulation executor "
|
|
127
|
-
"with a specified number of `SuperNodes`, you can either provide "
|
|
128
|
-
"a `--executor` that has been initialized with a number of nodes "
|
|
129
|
-
"to the `flower-superexec` CLI, or `--executor-config num-supernodes=N`"
|
|
130
|
-
"to the `flower-superexec` CLI."
|
|
131
|
-
)
|
|
132
124
|
try:
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
[sys.executable, "-m", "pip", "install", "--no-deps", str(fab_path)],
|
|
140
|
-
stdout=None if self.verbose else subprocess.DEVNULL,
|
|
141
|
-
stderr=None if self.verbose else subprocess.DEVNULL,
|
|
142
|
-
check=True,
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
# Load and validate config
|
|
146
|
-
config, errors, warnings = load_and_validate(fab_path / "pyproject.toml")
|
|
147
|
-
if errors:
|
|
148
|
-
raise ValueError(errors)
|
|
149
|
-
|
|
150
|
-
if warnings:
|
|
151
|
-
log(WARN, warnings)
|
|
152
|
-
|
|
153
|
-
if config is None:
|
|
154
|
-
raise ValueError(
|
|
155
|
-
"Config extracted from FAB's pyproject.toml is not valid"
|
|
125
|
+
# Create run
|
|
126
|
+
fab = Fab(hashlib.sha256(fab_file).hexdigest(), fab_file)
|
|
127
|
+
fab_hash = self.ffs.put(fab.content, {})
|
|
128
|
+
if fab_hash != fab.hash_str:
|
|
129
|
+
raise RuntimeError(
|
|
130
|
+
f"FAB ({fab.hash_str}) hash from request doesn't match contents"
|
|
156
131
|
)
|
|
157
132
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
num_supernodes = federation_config_flat.get(
|
|
162
|
-
"num-supernodes", self.num_supernodes
|
|
133
|
+
run_id = self.linkstate.create_run(
|
|
134
|
+
None, None, fab_hash, override_config, federation_options
|
|
163
135
|
)
|
|
164
|
-
backend_cfg = federation_config_flat.get("backend", {})
|
|
165
|
-
verbose: Optional[bool] = federation_config_flat.get("verbose")
|
|
166
|
-
|
|
167
|
-
# In Simulation there is no SuperLink, still we create a run_id
|
|
168
|
-
run_id = generate_rand_int_from_bytes(RUN_ID_NUM_BYTES)
|
|
169
|
-
log(INFO, "Created run %s", str(run_id))
|
|
170
136
|
|
|
171
|
-
#
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
"--run-id",
|
|
179
|
-
str(run_id),
|
|
180
|
-
]
|
|
181
|
-
|
|
182
|
-
if backend_cfg:
|
|
183
|
-
# Stringify as JSON
|
|
184
|
-
command.extend(["--backend-config", json.dumps(backend_cfg)])
|
|
185
|
-
|
|
186
|
-
if verbose:
|
|
187
|
-
command.extend(["--verbose"])
|
|
188
|
-
|
|
189
|
-
if override_config:
|
|
190
|
-
override_config_str = _user_config_to_str(override_config)
|
|
191
|
-
command.extend(["--run-config", f"{override_config_str}"])
|
|
192
|
-
|
|
193
|
-
# Start Simulation
|
|
194
|
-
proc = subprocess.Popen( # pylint: disable=consider-using-with
|
|
195
|
-
command,
|
|
196
|
-
text=True,
|
|
137
|
+
# Create an empty context for the Run
|
|
138
|
+
context = Context(
|
|
139
|
+
run_id=run_id,
|
|
140
|
+
node_id=0,
|
|
141
|
+
node_config={},
|
|
142
|
+
state=RecordSet(),
|
|
143
|
+
run_config={},
|
|
197
144
|
)
|
|
198
145
|
|
|
199
|
-
|
|
146
|
+
# Register the context at the LinkState
|
|
147
|
+
self.linkstate.set_serverapp_context(run_id=run_id, context=context)
|
|
200
148
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
)
|
|
149
|
+
log(INFO, "Created run %s", str(run_id))
|
|
150
|
+
|
|
151
|
+
return run_id
|
|
205
152
|
|
|
206
153
|
# pylint: disable-next=broad-except
|
|
207
154
|
except Exception as e:
|