@fluidframework/azure-end-to-end-tests 2.70.0-361092 → 2.70.0-361788

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,11 +13,20 @@ export interface UserIdAndName {
13
13
  name: string;
14
14
  }
15
15
 
16
+ export interface EventEntry {
17
+ timestamp: number;
18
+ agentId: string;
19
+ eventCategory: string;
20
+ eventName: string;
21
+ details?: string;
22
+ }
23
+
16
24
  /**
17
25
  * Message types sent from the orchestrator to the child processes
18
26
  */
19
27
  export type MessageToChild =
20
28
  | ConnectCommand
29
+ | DebugReportCommand
21
30
  | DisconnectSelfCommand
22
31
  | RegisterWorkspaceCommand
23
32
  | GetLatestValueCommand
@@ -50,6 +59,24 @@ export interface ConnectCommand {
50
59
  containerId?: string;
51
60
  }
52
61
 
62
+ /**
63
+ * Instructs a child process to report debug information.
64
+ *
65
+ * @privateRemarks
66
+ * This can be expanded over time to include more options.
67
+ */
68
+ interface DebugReportCommand {
69
+ command: "debugReport";
70
+ /**
71
+ * Send event log entries.
72
+ */
73
+ sendEventLog?: true;
74
+ /**
75
+ * Send basic attendee statistics (like count of connected).
76
+ */
77
+ reportAttendees?: true;
78
+ }
79
+
53
80
  /**
54
81
  * Instructs a child process to disconnect from a Fluid container.
55
82
  * A {@link DisconnectedSelfEvent} should be expected in response.
@@ -127,6 +154,7 @@ export type MessageFromChild =
127
154
  | AttendeeConnectedEvent
128
155
  | AttendeeDisconnectedEvent
129
156
  | ConnectedEvent
157
+ | DebugReportCompleteEvent
130
158
  | DisconnectedSelfEvent
131
159
  | ErrorEvent
132
160
  | LatestMapValueGetResponseEvent
@@ -167,6 +195,14 @@ interface ConnectedEvent {
167
195
  attendeeId: AttendeeId;
168
196
  }
169
197
 
198
+ /**
199
+ * Sent from the child processes to the orchestrator in response to a {@link DebugReportCommand}.
200
+ */
201
+ interface DebugReportCompleteEvent {
202
+ event: "debugReportComplete";
203
+ log?: EventEntry[];
204
+ }
205
+
170
206
  /**
171
207
  * Sent from the child processes to the orchestrator in response to a {@link DisconnectSelfCommand}.
172
208
  */
@@ -3,7 +3,8 @@
3
3
  * Licensed under the MIT License.
4
4
  */
5
5
 
6
- import { fork, type ChildProcess } from "node:child_process";
6
+ import { fork } from "node:child_process";
7
+ import type { ChildProcess as AnyChildProcess } from "node:child_process";
7
8
 
8
9
  import { ScopeType } from "@fluidframework/driver-definitions/legacy";
9
10
  import type { AttendeeId } from "@fluidframework/presence/beta";
@@ -16,6 +17,8 @@ import type {
16
17
  LatestMapValueUpdatedEvent,
17
18
  LatestValueGetResponseEvent,
18
19
  LatestMapValueGetResponseEvent,
20
+ MessageToChild,
21
+ EventEntry,
19
22
  } from "./messageTypes.js";
20
23
 
21
24
  /**
@@ -39,6 +42,10 @@ export const testConsole = {
39
42
  error: console.error,
40
43
  };
41
44
 
45
+ interface ChildProcess extends AnyChildProcess {
46
+ send(message: MessageToChild): boolean;
47
+ }
48
+
42
49
  /**
43
50
  * Fork child processes to simulate multiple Fluid clients.
44
51
  *
@@ -85,9 +92,12 @@ export async function forkChildProcesses(
85
92
  });
86
93
  childReadyPromises.push(readyPromise);
87
94
  const errorPromise = new Promise<never>((_resolve, reject) => {
88
- child.on("error", (error) => {
95
+ child.once("error", (error) => {
89
96
  reject(new Error(`Child${i} process errored: ${error.message}`));
90
97
  });
98
+ child.once("exit", (code, signal) => {
99
+ reject(new Error(`Child${i} process exited: code ${code}, signal ${signal}`));
100
+ });
91
101
  });
92
102
  childErrorPromises.push(errorPromise);
93
103
  child.send({ command: "ping" });
@@ -98,6 +108,42 @@ export async function forkChildProcesses(
98
108
  return { children, childErrorPromise };
99
109
  }
100
110
 
111
+ /**
112
+ * Instructs all listed child processes to send debug reports, then outputs the
113
+ * combined collection sorted by timestamp. Report content is up to the child
114
+ * processes, but typically includes messages sent and some telemetry events.
115
+ */
116
+ export async function executeDebugReports(
117
+ childrenRequestedToReport: ChildProcess[],
118
+ ): Promise<void> {
119
+ const debugReportPromises: Promise<EventEntry[]>[] = [];
120
+ for (const child of childrenRequestedToReport) {
121
+ const debugReportPromise = new Promise<EventEntry[]>((resolve) => {
122
+ const handler = (msg: MessageFromChild): void => {
123
+ if (msg.event === "debugReportComplete") {
124
+ resolve(msg.log ?? []);
125
+ child.off("message", handler);
126
+ }
127
+ };
128
+ child.on("message", handler);
129
+ });
130
+ debugReportPromises.push(debugReportPromise);
131
+ child.send({ command: "debugReport", sendEventLog: true, reportAttendees: true });
132
+ }
133
+
134
+ const logs = await Promise.all(debugReportPromises);
135
+ const combinedLogs = logs.flat().sort((a, b) => a.timestamp - b.timestamp);
136
+ for (const entry of combinedLogs) {
137
+ testConsole.log(
138
+ `[${new Date(entry.timestamp).toISOString()}] [${entry.agentId}] [${entry.eventCategory}] ${entry.eventName}${
139
+ entry.details
140
+ ? ` - ${typeof entry.details === "string" ? entry.details : JSON.stringify(entry.details)}`
141
+ : ""
142
+ }`,
143
+ );
144
+ }
145
+ }
146
+
101
147
  /**
102
148
  * Creates a {@link ConnectCommand} for a test user with a deterministic id and name.
103
149
  *
@@ -118,6 +164,47 @@ function composeConnectMessage(
118
164
  };
119
165
  }
120
166
 
167
+ /**
168
+ * Listens for a "connected" response from a child process,
169
+ * allowing/handling a subset of other expected messages.
170
+ */
171
+ function listenForConnectedResponse({
172
+ child,
173
+ childId,
174
+ onConnected,
175
+ reject,
176
+ }: {
177
+ child: ChildProcess;
178
+ childId: number | string;
179
+ /**
180
+ * Will be called up to once when a "connected" message is received.
181
+ */
182
+ onConnected: (msg: Extract<MessageFromChild, { event: "connected" }>) => void;
183
+ /**
184
+ * Callback to reject for unexpected messages or child errors.
185
+ */
186
+ reject: (reason?: unknown) => void;
187
+ }): void {
188
+ const listener = (msg: MessageFromChild): void => {
189
+ if (msg.event === "connected") {
190
+ child.off("message", listener);
191
+ onConnected(msg);
192
+ } else if (msg.event === "error") {
193
+ child.off("message", listener);
194
+ reject(new Error(`Child ${childId} process error: ${msg.error}`));
195
+ } else if (msg.event !== "ack") {
196
+ child.off("message", listener);
197
+ // This is not strictly required, but is the current expectation.
198
+ reject(
199
+ new Error(
200
+ `Unexpected message from child ${childId} while connecting: ${JSON.stringify(msg)}`,
201
+ ),
202
+ );
203
+ }
204
+ };
205
+ child.on("message", listener);
206
+ }
207
+
121
208
  interface CreatorAttendeeIdAndAttendeePromises {
122
209
  containerCreatorAttendeeId: AttendeeId;
123
210
  attendeeIdPromises: Promise<AttendeeId>[];
@@ -140,18 +227,27 @@ export async function connectChildProcesses(
140
227
  const containerReadyPromise = new Promise<{
141
228
  containerCreatorAttendeeId: AttendeeId;
142
229
  containerId: string;
143
- }>((resolve, reject) => {
144
- firstChild.once("message", (msg: MessageFromChild) => {
145
- if (msg.event === "connected" && msg.containerId) {
146
- resolve({
147
- containerCreatorAttendeeId: msg.attendeeId,
148
- containerId: msg.containerId,
149
- });
150
- } else {
151
- reject(new Error(`Non-connected message from child0: ${JSON.stringify(msg)}`));
152
- }
153
- });
154
- });
230
+ }>((resolve, reject) =>
231
+ listenForConnectedResponse({
232
+ child: firstChild,
233
+ childId: 0,
234
+ onConnected: (msg) => {
235
+ if (msg.containerId) {
236
+ resolve({
237
+ containerCreatorAttendeeId: msg.attendeeId,
238
+ containerId: msg.containerId,
239
+ });
240
+ } else {
241
+ reject(
242
+ new Error(
243
+ `Child 0 (creator) connected without containerId: ${JSON.stringify(msg)}`,
244
+ ),
245
+ );
246
+ }
247
+ },
248
+ reject,
249
+ }),
250
+ );
155
251
  {
156
252
  // Note that DocWrite is used to have this attendee be the "leader".
157
253
  // DocRead would also be valid as DocWrite is specified for attach when there
@@ -182,15 +278,16 @@ export async function connectChildProcesses(
182
278
  );
183
279
  message.containerId = containerId;
184
280
  attendeeIdPromises.push(
185
- new Promise<AttendeeId>((resolve, reject) => {
186
- child.once("message", (msg: MessageFromChild) => {
187
- if (msg.event === "connected") {
281
+ new Promise<AttendeeId>((resolve, reject) =>
282
+ listenForConnectedResponse({
283
+ child,
284
+ childId: index,
285
+ onConnected: (msg) => {
188
286
  resolve(msg.attendeeId);
189
- } else if (msg.event === "error") {
190
- reject(new Error(`Child process error: ${msg.error}`));
191
- }
192
- });
193
- }),
287
+ },
288
+ reject,
289
+ }),
290
+ ),
194
291
  );
195
292
  child.send(message);
196
293
  }
@@ -236,6 +333,7 @@ export async function connectAndListenForAttendees(
236
333
  if (msg.event === "attendeeConnected") {
237
334
  attendeesJoinedEvents++;
238
335
  if (attendeesJoinedEvents >= attendeeCountRequired) {
336
+ child.off("message", listenForAttendees);
239
337
  resolve();
240
338
  }
241
339
  }
@@ -323,7 +421,7 @@ export async function connectAndWaitForAttendees(
323
421
  export async function registerWorkspaceOnChildren(
324
422
  children: ChildProcess[],
325
423
  workspaceId: string,
326
- options: { latest?: boolean; latestMap?: boolean; timeoutMs: number },
424
+ options: { latest?: true; latestMap?: true; timeoutMs: number },
327
425
  ): Promise<void> {
328
426
  const { latest, latestMap, timeoutMs } = options;
329
427
  const promises = children.map(async (child, index) => {
@@ -15,6 +15,7 @@ import {
15
15
  connectAndListenForAttendees,
16
16
  connectAndWaitForAttendees,
17
17
  connectChildProcesses,
18
+ executeDebugReports,
18
19
  forkChildProcesses,
19
20
  getLatestMapValueResponses,
20
21
  getLatestValueResponses,
@@ -46,7 +47,7 @@ const timeoutMultiplier = debuggerAttached ? 1000 : useAzure ? 3 : 1;
46
47
  * @param context - The Mocha test context.
47
48
  * @param duration - The duration in milliseconds to set the timeout to. Zero disables the timeout.
48
49
  */
49
- function setTimeout(context: Mocha.Context, duration: number): void {
50
+ function setTestTimeout(context: Mocha.Context, duration: number): void {
50
51
  const currentTimeout = context.timeout();
51
52
  const newTimeout =
52
53
  debuggerAttached || currentTimeout === 0 || duration === 0
@@ -86,11 +87,6 @@ function setTimeout(context: Mocha.Context, duration: number): void {
86
87
  * - Send response messages including any relevant data back to the orchestrator to verify expected behavior.
87
88
  */
88
89
 
89
- /**
90
- * This particular test suite tests the following E2E functionality for Presence:
91
- * - Announce 'attendeeConnected' when remote client joins session.
92
- * - Announce 'attendeeDisconnected' when remote client disconnects.
93
- */
94
90
  describe(`Presence with AzureClient`, () => {
95
91
  const afterCleanUp: (() => void)[] = [];
96
92
 
@@ -112,9 +108,13 @@ describe(`Presence with AzureClient`, () => {
112
108
  */
113
109
  const childConnectTimeoutMs = 1000 * numClients * timeoutMultiplier;
114
110
  /**
115
- * Timeout for presence attendees to connect {@link AttendeeConnectedEvent}
111
+ * Timeout for presence attendees to join from the first child's perspective {@link AttendeeConnectedEvent}
116
112
  */
117
113
  const allAttendeesJoinedTimeoutMs = (1000 + 200 * numClients) * timeoutMultiplier;
114
+ /**
115
+ * Timeout for presence attendees to fully join (everyone knows about everyone) {@link AttendeeConnectedEvent}
116
+ */
117
+ const allAttendeesFullyJoinedTimeoutMs = (2000 + 300 * numClients) * timeoutMultiplier;
118
118
 
119
119
  for (const writeClients of [numClients, 1]) {
120
120
  it(`announces 'attendeeConnected' when remote client joins session [${numClients} clients, ${writeClients} writers]`, async function () {
@@ -123,7 +123,7 @@ describe(`Presence with AzureClient`, () => {
123
123
  this.skip();
124
124
  }
125
125
 
126
- setTimeout(this, childConnectTimeoutMs + allAttendeesJoinedTimeoutMs + 1000);
126
+ setTestTimeout(this, childConnectTimeoutMs + allAttendeesJoinedTimeoutMs + 1000);
127
127
 
128
128
  // Setup
129
129
  const { children, childErrorPromise } = await forkChildProcesses(
@@ -144,11 +144,8 @@ describe(`Presence with AzureClient`, () => {
144
144
  );
145
145
  });
146
146
 
147
- it(`announces 'attendeeDisconnected' when remote client disconnects [${numClients} clients, ${writeClients} writers]`, async function () {
148
- // AB#48866: Fix intermittently failing presence tests
149
- if (useAzure) {
150
- this.skip();
151
- }
147
+ // Even at 5 clients, reaching a fully connected state is unreliable with the current implementation.
148
+ it.skip(`announces 'attendeeDisconnected' when remote client disconnects [${numClients} clients, ${writeClients} writers]`, async function () {
152
149
  // TODO: AB#45620: "Presence: perf: update Join pattern for scale" can handle
153
150
  // larger counts of read-only attendees. Without protocol changes tests with
154
151
  // 20+ attendees exceed current limits.
@@ -156,12 +153,19 @@ describe(`Presence with AzureClient`, () => {
156
153
  this.skip();
157
154
  }
158
155
 
156
+ if (useAzure && numClients > 50) {
157
+ // Even with increased timeouts, more than 50 clients can be too large for AFR.
158
+ // This may be due to slow responses/inactivity from the clients that are
159
+ // creating pressure on ADO agent.
160
+ this.skip();
161
+ }
162
+
159
163
  const childDisconnectTimeoutMs = 10_000 * timeoutMultiplier;
160
164
 
161
- setTimeout(
165
+ setTestTimeout(
162
166
  this,
163
167
  childConnectTimeoutMs +
164
- allAttendeesJoinedTimeoutMs +
168
+ allAttendeesFullyJoinedTimeoutMs +
165
169
  childDisconnectTimeoutMs +
166
170
  1000,
167
171
  );
@@ -172,31 +176,91 @@ describe(`Presence with AzureClient`, () => {
172
176
  afterCleanUp,
173
177
  );
174
178
 
179
+ const startConnectAndFullJoin = performance.now();
175
180
  const connectResult = await connectAndListenForAttendees(children, {
176
181
  writeClients,
177
182
  attendeeCountRequired: numClients - 1,
178
183
  childConnectTimeoutMs,
179
184
  });
185
+ // eslint-disable-next-line @typescript-eslint/no-floating-promises
186
+ connectResult.attendeeCountRequiredPromises[0].then(() =>
187
+ testConsole.log(
188
+ `[${new Date().toISOString()}] All attendees joined per child 0 after ${performance.now() - startConnectAndFullJoin}ms`,
189
+ ),
190
+ );
180
191
 
181
192
  // Wait for all attendees to be fully joined
182
193
  // Keep a tally for debuggability
183
194
  let childrenFullyJoined = 0;
195
+ const setNotFullyJoined = new Set<number>();
196
+ for (let i = 0; i < children.length; i++) {
197
+ setNotFullyJoined.add(i);
198
+ }
184
199
  const allAttendeesFullyJoined = Promise.all(
185
- // eslint-disable-next-line @typescript-eslint/promise-function-async
186
- connectResult.attendeeCountRequiredPromises.map((attendeeFullyJoinedPromise) =>
187
- attendeeFullyJoinedPromise.then(() => childrenFullyJoined++),
200
+ connectResult.attendeeCountRequiredPromises.map(
201
+ async (attendeeFullyJoinedPromise, index) => {
202
+ await attendeeFullyJoinedPromise;
203
+ childrenFullyJoined++;
204
+ setNotFullyJoined.delete(index);
205
+ },
188
206
  ),
189
207
  );
190
- await timeoutAwait(allAttendeesFullyJoined, {
191
- durationMs: allAttendeesJoinedTimeoutMs,
208
+ let timedout = true;
209
+ const allFullyJoinedOrChildError = Promise.race([
210
+ allAttendeesFullyJoined,
211
+ childErrorPromise,
212
+ ]).finally(() => (timedout = false));
213
+ await timeoutAwait(allFullyJoinedOrChildError, {
214
+ durationMs: allAttendeesFullyJoinedTimeoutMs,
192
215
  errorMsg: "Not all attendees fully joined",
193
- }).catch((error) => {
216
+ }).catch(async (error) => {
194
217
  // Ideally this information would just be in the timeout error message, but that
195
218
  // must be a resolved string (not dynamic). So, just log it separately.
196
- testConsole.log(`${childrenFullyJoined} attendees fully joined before error...`);
219
+ testConsole.log(
220
+ `[${new Date().toISOString()}] ${childrenFullyJoined} attendees fully joined before error...`,
221
+ );
222
+ if (timedout) {
223
+ // If timed out, gather additional timing data to understand what increased
224
+ // timeout might work. The test will still fail even if this secondary wait succeeds.
225
+ const startAdditionalWait = performance.now();
226
+ try {
227
+ await timeoutAwait(allFullyJoinedOrChildError, {
228
+ durationMs: allAttendeesFullyJoinedTimeoutMs,
229
+ });
230
+ testConsole.log(
231
+ `[${new Date().toISOString()}] All attendees fully joined after additional wait (${performance.now() - startAdditionalWait}ms)`,
232
+ );
233
+ } catch (secondaryError) {
234
+ testConsole.log(
235
+ `[${new Date().toISOString()}] Secondary await resulted in`,
236
+ secondaryError,
237
+ );
238
+ }
239
+ }
240
+
241
+ // Gather and report debug info from children
242
+ // If there are 10 or fewer children, get all reports.
243
+ // Otherwise, just child 0 and those not fully joined.
244
+ setTestTimeout(this, 0); // Disable test timeout. Will throw within 20s below.
245
+ const childrenRequestedToReport =
246
+ children.length <= 10
247
+ ? children
248
+ : // Just child 0 and those not fully joined
249
+ children.filter((_, index) => index === 0 || setNotFullyJoined.has(index));
250
+ await timeoutAwait(
251
+ Promise.race([executeDebugReports(childrenRequestedToReport), childErrorPromise]),
252
+ { durationMs: 20_000, errorMsg: "Debug report timeout" },
253
+ ).catch((debugAwaitError) => {
254
+ testConsole.error("Debug report await resulted in", debugAwaitError);
255
+ });
256
+
197
257
  throw error;
198
258
  });
259
+ testConsole.log(
260
+ `[${new Date().toISOString()}] All attendees fully joined after ${performance.now() - startConnectAndFullJoin}ms`,
261
+ );
199
262
 
263
+ let child0ReportRequested = false;
200
264
  const waitForDisconnected = children.map(async (child, index) =>
201
265
  index === 0
202
266
  ? Promise.resolve()
@@ -216,7 +280,23 @@ describe(`Presence with AzureClient`, () => {
216
280
  durationMs: childDisconnectTimeoutMs,
217
281
  errorMsg: `Attendee[${index}] Disconnected Timeout`,
218
282
  },
219
- ),
283
+ ).catch(async (error) => {
284
+ const childrenRequestedToReport = [child];
285
+ if (!child0ReportRequested) {
286
+ childrenRequestedToReport.unshift(children[0]);
287
+ child0ReportRequested = true;
288
+ }
289
+ await timeoutAwait(
290
+ Promise.race([
291
+ executeDebugReports(childrenRequestedToReport),
292
+ childErrorPromise,
293
+ ]),
294
+ { durationMs: 20_000, errorMsg: "Debug report timeout" },
295
+ ).catch((debugAwaitError) => {
296
+ testConsole.error("Debug report await resulted in", debugAwaitError);
297
+ });
298
+ throw error;
299
+ }),
220
300
  );
221
301
 
222
302
  // Act - disconnect first child process
package/src/test/utils.ts CHANGED
@@ -59,7 +59,7 @@ export const configProvider = (
59
59
  * Currently, there is only a test-set for Durable containers and one for Ephemeral containers.
60
60
  * The Ephemeral container tests will not run for local tests.
61
61
  *
62
- * @returns - The test matrix
62
+ * @returns The test matrix
63
63
  */
64
64
  export function getTestMatrix(): { variant: string; options: { isEphemeral: boolean } }[] {
65
65
  const testMatrix = [