@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import html as html_module
2
2
  import markdown
3
3
  from common import METRIC_IDS, STATUS_PASS, STATUS_FAIL, STATUS_ERROR, STATUS_PARTIAL, STATUS_UNKNOWN, pascal_case_to_title
4
4
  from datetime import datetime, timezone
5
+ from evaluator_resolver import EVALUATOR_REGISTRY
5
6
 
6
7
  def calculate_aggregate_statistics(results):
7
8
  """Calculate aggregate statistics across all evaluation results.
@@ -36,6 +37,7 @@ def calculate_aggregate_statistics(results):
36
37
  scores = []
37
38
  pass_count = 0
38
39
  fail_count = 0
40
+ error_count = 0
39
41
  threshold_value = None
40
42
  prompts_evaluated = 0
41
43
 
@@ -49,19 +51,20 @@ def calculate_aggregate_statistics(results):
49
51
  prompts_evaluated += 1
50
52
  try:
51
53
  score = parsed_data.get(metric_id)
52
-
53
54
  result_status = parsed_data.get('result')
54
-
55
55
  threshold = parsed_data.get('threshold')
56
56
 
57
57
  if score is not None:
58
58
  scores.append(float(score))
59
59
 
60
60
  if result_status:
61
- if str(result_status).lower() == STATUS_PASS:
61
+ status = str(result_status).lower()
62
+ if status == STATUS_PASS:
62
63
  pass_count += 1
63
- elif str(result_status).lower() == STATUS_FAIL:
64
+ elif status == STATUS_FAIL:
64
65
  fail_count += 1
66
+ elif status == STATUS_ERROR:
67
+ error_count += 1
65
68
 
66
69
  if threshold is not None and threshold_value is None:
67
70
  threshold_value = threshold
@@ -69,17 +72,33 @@ def calculate_aggregate_statistics(results):
69
72
  except (ValueError, TypeError):
70
73
  continue
71
74
 
72
- if scores or pass_count > 0 or fail_count > 0:
75
+ # Surface evaluators that ran in any form, including those whose only
76
+ # attempts errored. Suppressing error-only evaluators would hide them
77
+ # from the aggregate report (SC-001).
78
+ if scores or pass_count > 0 or fail_count > 0 or error_count > 0:
73
79
  avg_score = sum(scores) / len(scores) if scores else 0
80
+ # Per-evaluator pass rate is "agreement among completed evaluations" —
81
+ # errors are surfaced separately as a count, not folded into the rate.
74
82
  total_evaluated = pass_count + fail_count
75
83
  pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
76
84
 
85
+ # Defensive fallback: if no per-entry threshold was recorded
86
+ # (shouldn't happen — both successful and errored runtime entries
87
+ # carry it — but guard against malformed input), use the registry
88
+ # default. Evaluators with no registry default (e.g. ExactMatch)
89
+ # legitimately have threshold=None.
90
+ if threshold_value is None:
91
+ registry_entry = EVALUATOR_REGISTRY.get(eval_name)
92
+ if registry_entry is not None:
93
+ threshold_value = registry_entry.default_threshold
94
+
77
95
  aggregates[display_name] = {
78
96
  'total_prompts': len(flat_results),
79
97
  'prompts_evaluated': prompts_evaluated,
80
98
  'total_evaluated': total_evaluated,
81
99
  'pass_count': pass_count,
82
100
  'fail_count': fail_count,
101
+ 'error_count': error_count,
83
102
  'pass_rate': pass_rate,
84
103
  'avg_score': avg_score,
85
104
  'threshold': threshold_value,
@@ -127,7 +146,13 @@ def extract_metric_rows(entry):
127
146
  score_val = pick(metric_obj, [metric_id])
128
147
  result_val = pick(metric_obj, ['result'])
129
148
  threshold_val = pick(metric_obj, ['threshold'])
130
- reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
149
+ # Errored entries carry the per-evaluator failure description in `error`
150
+ # (e.g. "Evaluator failed: Connection timeout"). Surface it in the Reason
151
+ # column so HTML readers see why the evaluator couldn't produce a result.
152
+ if result_val == STATUS_ERROR:
153
+ reason_val = metric_obj.get('error', '')
154
+ else:
155
+ reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
131
156
 
132
157
  rows.append({
133
158
  'Metric': display_name,
@@ -138,35 +163,29 @@ def extract_metric_rows(entry):
138
163
  })
139
164
  return rows
140
165
 
141
- def prompt_passed(entry):
142
- """Determine whether a prompt passed all evaluations.
166
+ _CHIP_CLASSES = {
167
+ STATUS_PASS: "status-pass",
168
+ STATUS_FAIL: "status-fail",
169
+ STATUS_PARTIAL: "status-partial",
170
+ STATUS_ERROR: "status-error",
171
+ }
143
172
 
144
- Centralized predicate used by both the summary banner and per-prompt
145
- cards so that pass/fail counts stay consistent.
146
173
 
147
- Called in two contexts:
148
- - On un-flattened results: multi-turn threads have type="multi_turn"
149
- and are evaluated via their summary.overall_status.
150
- - On flattened results (banner counts): individual turns have
151
- status="pass"/"fail"/"error" and are evaluated like single-turn items.
174
+ def _chip_class(status):
175
+ """Map a status value to its chip CSS class. Unknown statuses fall back to status-error."""
176
+ return _CHIP_CLASSES.get(status, "status-error")
152
177
 
153
- A prompt/turn fails when:
154
- - it is a multi-turn thread with overall_status != 'pass', OR
155
- - its status is explicitly 'fail' or 'error', OR
156
- - any metric result is explicitly 'fail'.
157
- Otherwise it is considered passed (including prompts with no metric rows).
178
+
179
+ def classify_attempt(entry):
180
+ """Return one of {pass, fail, partial, error} for an attempt or a thread.
181
+
182
+ For an un-flattened multi-turn thread, returns the thread's overall_status.
183
+ For a single-turn item or a per-turn entry (from a flattened thread), returns
184
+ the entry's status — which is set authoritatively by the runner.
158
185
  """
159
186
  if entry.get("type") == "multi_turn":
160
- summary = entry.get("summary", {})
161
- return summary.get("overall_status") == STATUS_PASS
162
-
163
- status = str(entry.get('status', '')).lower()
164
- if status in (STATUS_FAIL, STATUS_ERROR):
165
- return False
166
- metric_rows = extract_metric_rows(entry)
167
- if any(str(row.get('Result', '')).lower() == STATUS_FAIL for row in metric_rows):
168
- return False
169
- return True
187
+ return entry.get("summary", {}).get("overall_status", STATUS_UNKNOWN)
188
+ return entry.get("status", STATUS_UNKNOWN)
170
189
 
171
190
  def _escape(text):
172
191
  """HTML-escape user-controlled content to prevent XSS."""
@@ -174,6 +193,35 @@ def _escape(text):
174
193
  return ""
175
194
  return html_module.escape(str(text))
176
195
 
196
+
197
+ _CELL_CLASSES = {
198
+ STATUS_PASS: "cell-pass",
199
+ STATUS_FAIL: "cell-fail",
200
+ STATUS_ERROR: "cell-error",
201
+ }
202
+
203
+
204
+ def _render_metric_table(html, rows):
205
+ """Append a metric-table block to ``html`` (no-op if rows is empty)."""
206
+ if not rows:
207
+ return
208
+ html.append(' <table class="metric-table">')
209
+ html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
210
+ for row in rows:
211
+ result_val = str(row.get("Result", "")).lower()
212
+ cell_class = _CELL_CLASSES.get(result_val)
213
+ result_attr = f' class="{cell_class}"' if cell_class else ""
214
+ html.append(
215
+ '<tr>'
216
+ f'<td>{_escape(row.get("Metric", ""))}</td>'
217
+ f'<td{result_attr}>{_escape(str(row.get("Result", "")))}</td>'
218
+ f'<td>{_escape(str(row.get("Score", "")))}</td>'
219
+ f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
220
+ f'<td>{_escape(str(row.get("Reason", "")))}</td>'
221
+ '</tr>'
222
+ )
223
+ html.append(' </table>')
224
+
177
225
  def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
178
226
  aggregates = calculate_aggregate_statistics(results)
179
227
 
@@ -186,9 +234,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
186
234
  flat_items.append(entry)
187
235
  total_prompts = len(flat_items)
188
236
 
189
- passed_prompts = sum(1 for item in flat_items if prompt_passed(item))
190
- failed_prompt_count = total_prompts - passed_prompts
191
- overall_pass_rate = (passed_prompts / total_prompts * 100) if total_prompts else 0
237
+ counts = {STATUS_PASS: 0, STATUS_FAIL: 0, STATUS_PARTIAL: 0, STATUS_ERROR: 0}
238
+ for item in flat_items:
239
+ c = classify_attempt(item)
240
+ if c in counts:
241
+ counts[c] += 1
242
+ incomplete_count = counts[STATUS_PARTIAL] + counts[STATUS_ERROR]
243
+ decisive_count = counts[STATUS_PASS] + counts[STATUS_FAIL]
244
+ overall_pass_rate = (counts[STATUS_PASS] / decisive_count * 100) if decisive_count else 0
192
245
  generated_utc = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
193
246
 
194
247
  html = [
@@ -208,6 +261,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
208
261
  ' --ok-ink: #15603a;',
209
262
  ' --bad-bg: #fdecec;',
210
263
  ' --bad-ink: #8b1e2f;',
264
+ ' --warn-bg: #fff4e0;',
265
+ ' --warn-ink: #8a5a00;',
266
+ ' --neutral-bg: #ececec;',
267
+ ' --neutral-ink: #4a4a4a;',
211
268
  ' --border: #dde2ea;',
212
269
  ' --bar-track: #e8edf5;',
213
270
  ' --bar-fill: #2b6cb0;',
@@ -217,7 +274,7 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
217
274
  ' .container { max-width: 1100px; margin: 0 auto; padding: 24px 18px 40px; }',
218
275
  ' h1 { margin: 0 0 8px; }',
219
276
  ' .meta { color: var(--muted); margin-bottom: 20px; }',
220
- ' .summary-banner { display: grid; grid-template-columns: repeat(4, minmax(140px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
277
+ ' .summary-banner { display: grid; grid-template-columns: repeat(5, minmax(120px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
221
278
  ' .summary-tile { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 14px; }',
222
279
  ' .summary-label { display: block; font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: .06em; }',
223
280
  ' .summary-value { display: block; margin-top: 6px; font-size: 24px; font-weight: 700; }',
@@ -231,6 +288,8 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
231
288
  ' .status-chip { display: inline-block; padding: 3px 8px; border-radius: 999px; font-size: 12px; font-weight: 600; margin-bottom: 10px; }',
232
289
  ' .status-pass { background: var(--ok-bg); color: var(--ok-ink); }',
233
290
  ' .status-fail { background: var(--bad-bg); color: var(--bad-ink); }',
291
+ ' .status-partial { background: var(--warn-bg); color: var(--warn-ink); }',
292
+ ' .status-error { background: var(--neutral-bg); color: var(--neutral-ink); }',
234
293
  ' .prompt-card h3 { margin: 0 0 8px; font-size: 16px; }',
235
294
  ' .kv { margin: 8px 0; }',
236
295
  ' .kv > strong { display: block; min-width: 130px; color: var(--muted); margin-bottom: 4px; }',
@@ -248,9 +307,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
248
307
  ' .metric-table th { background: #f4f6fa; }',
249
308
  ' .metric-table .cell-pass { background: var(--ok-bg); color: var(--ok-ink); font-weight: 600; }',
250
309
  ' .metric-table .cell-fail { background: var(--bad-bg); color: var(--bad-ink); font-weight: 600; }',
310
+ ' .metric-table .cell-error { background: var(--neutral-bg); color: var(--neutral-ink); font-weight: 600; }',
251
311
  ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
252
312
  ' .footer { margin-top: 20px; color: var(--muted); font-size: 13px; }',
253
- ' @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(140px, 1fr)); } .kv strong { min-width: 90px; } }',
313
+ ' @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(120px, 1fr)); } .kv strong { min-width: 90px; } }',
254
314
  ' </style>',
255
315
  '</head>',
256
316
  '<body>',
@@ -269,9 +329,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
269
329
  html.append(f' <p class="meta">{" | ".join(metadata_items)}</p>')
270
330
 
271
331
  html.append(' <section class="summary-banner" aria-label="summary banner">')
272
- html.append(f' <div class="summary-tile"><span class="summary-label">Total Prompts</span><span class="summary-value">{total_prompts}</span></div>')
273
- html.append(f' <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{passed_prompts}</span></div>')
274
- html.append(f' <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{failed_prompt_count}</span></div>')
332
+ html.append(f' <div class="summary-tile"><span class="summary-label">Total</span><span class="summary-value">{total_prompts}</span></div>')
333
+ html.append(f' <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{counts[STATUS_PASS]}</span></div>')
334
+ html.append(f' <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{counts[STATUS_FAIL]}</span></div>')
335
+ html.append(f' <div class="summary-tile"><span class="summary-label">Incomplete</span><span class="summary-value">{incomplete_count}</span></div>')
275
336
  html.append(f' <div class="summary-tile"><span class="summary-label">Pass Rate</span><span class="summary-value">{overall_pass_rate:.1f}%</span></div>')
276
337
  html.append(' </section>')
277
338
 
@@ -283,11 +344,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
283
344
  prompts_evaluated = stats.get('prompts_evaluated', stats.get('total_evaluated', 0))
284
345
  html.append('<div class="evaluator-row">')
285
346
  avg_score = stats.get('avg_score', 0)
286
- threshold = stats.get('threshold', 'N/A')
347
+ threshold_val = stats.get('threshold')
348
+ threshold_str = "N/A" if threshold_val is None else str(threshold_val)
349
+ error_count = stats.get('error_count', 0)
350
+ error_clause = f' / {error_count} error' if error_count else ''
287
351
  html.append(
288
352
  f'<div class="evaluator-head"><strong>{_escape(metric_name)}</strong>'
289
- f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail, {prompts_evaluated}/{total_prompts} prompts)'
290
- f' &middot; Avg Score: {avg_score:.2f} &middot; Threshold: {_escape(str(threshold))}</span></div>'
353
+ f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail{error_clause}, {prompts_evaluated}/{total_prompts} prompts)'
354
+ f' &middot; Avg Score: {avg_score:.2f} &middot; Threshold: {_escape(threshold_str)}</span></div>'
291
355
  )
292
356
  html.append('<div class="progress-track" role="progressbar" aria-valuemin="0" aria-valuemax="100" aria-valuenow="{:.1f}" aria-label="{} pass rate">'.format(pass_rate, _escape(metric_name)))
293
357
  html.append(f'<div class="progress-fill" style="width:{pass_rate:.1f}%"></div></div>')
@@ -305,27 +369,18 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
305
369
  # Multi-turn thread card
306
370
  thread_name = _escape(entry.get("name", "Unnamed Thread"))
307
371
  summary = entry.get("summary", {})
308
- status = summary.get("overall_status", STATUS_UNKNOWN)
309
- is_passed = status == STATUS_PASS
310
- chip_class = 'status-pass' if is_passed else 'status-fail'
311
- chip_text = 'PASSED' if is_passed else ('PARTIAL' if status == STATUS_PARTIAL else 'FAILED')
372
+ thread_status = summary.get("overall_status", STATUS_UNKNOWN)
312
373
 
313
374
  html.append(' <article class="prompt-card">')
314
- html.append(f' <span class="status-chip {chip_class}">{chip_text}</span>')
375
+ html.append(f' <span class="status-chip {_chip_class(thread_status)}">{thread_status.upper()}</span>')
315
376
  html.append(f' <h3>Thread {idx}: {thread_name}</h3>')
316
377
  html.append(f' <p>{summary.get("turns_passed", 0)}/{summary.get("turns_total", 0)} turns passed</p>')
317
378
 
318
379
  for t_idx, turn in enumerate(entry.get("turns", []), 1):
319
380
  turn_status = turn.get("status", STATUS_UNKNOWN)
320
- turn_chip_class = 'status-pass' if turn_status == STATUS_PASS else 'status-fail'
321
- turn_chip_text = {
322
- STATUS_PASS: 'PASSED',
323
- STATUS_FAIL: 'FAILED',
324
- STATUS_ERROR: 'ERROR',
325
- }.get(turn_status, turn_status.upper())
326
381
 
327
382
  html.append(f' <div style="margin-left:16px;padding:8px 0;border-top:1px solid var(--border);">')
328
- html.append(f' <span class="status-chip {turn_chip_class}">{turn_chip_text}</span>')
383
+ html.append(f' <span class="status-chip {_chip_class(turn_status)}">{turn_status.upper()}</span>')
329
384
  html.append(f' <strong>Turn {t_idx}:</strong> {_escape(turn.get("prompt", ""))}')
330
385
 
331
386
  turn_evaluators = turn.get('evaluators_ran', [])
@@ -335,38 +390,24 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
335
390
 
336
391
  if turn.get("response"):
337
392
  html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(turn.get("response", "")))}</div></div>')
338
- if turn.get("error"):
339
- html.append(f' <p class="kv"><strong>Error:</strong> {_escape(turn["error"])}</p>')
340
-
341
- turn_rows = extract_metric_rows(turn)
342
- if turn_rows:
343
- html.append(' <table class="metric-table">')
344
- html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
345
- for row in turn_rows:
346
- result_val = str(row.get("Result", "")).lower()
347
- result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
348
- html.append(
349
- '<tr>'
350
- f'<td>{_escape(row.get("Metric", ""))}</td>'
351
- f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
352
- f'<td>{_escape(str(row.get("Score", "")))}</td>'
353
- f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
354
- f'<td>{_escape(str(row.get("Reason", "")))}</td>'
355
- '</tr>'
356
- )
357
- html.append(' </table>')
393
+ turn_error = turn.get("error")
394
+ if turn_error:
395
+ html.append(
396
+ f' <p class="kv" data-error-code="{_escape(turn_error.get("code", ""))}">'
397
+ f'<strong>Error:</strong> {_escape(turn_error.get("message", ""))}</p>'
398
+ )
399
+
400
+ _render_metric_table(html, extract_metric_rows(turn))
358
401
 
359
402
  html.append(' </div>')
360
403
 
361
404
  html.append(' </article>')
362
405
  else:
363
406
  score_rows = extract_metric_rows(entry)
364
- is_passed = prompt_passed(entry)
365
- chip_class = 'status-pass' if is_passed else 'status-fail'
366
- chip_text = 'PASSED' if is_passed else 'FAILED'
407
+ item_status = classify_attempt(entry)
367
408
 
368
409
  html.append(' <article class="prompt-card">')
369
- html.append(f' <span class="status-chip {chip_class}">{chip_text}</span>')
410
+ html.append(f' <span class="status-chip {_chip_class(item_status)}">{item_status.upper()}</span>')
370
411
  html.append(f' <h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
371
412
 
372
413
  evaluators_ran = entry.get('evaluators_ran', [])
@@ -377,26 +418,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
377
418
  html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("response", "")))}</div></div>')
378
419
  html.append(f' <div class="kv"><strong>Expected:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("expected_response", "")))}</div></div>')
379
420
 
380
- error_details = entry.get('error_details') or entry.get('errorDetails')
381
- if error_details:
382
- html.append(f' <p class="kv"><strong>Error Details:</strong> {_escape(error_details)}</p>')
421
+ item_error = entry.get('error')
422
+ if item_error:
423
+ html.append(
424
+ f' <p class="kv" data-error-code="{_escape(item_error.get("code", ""))}">'
425
+ f'<strong>Error:</strong> {_escape(item_error.get("message", ""))}</p>'
426
+ )
383
427
 
384
- if score_rows:
385
- html.append(' <table class="metric-table">')
386
- html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
387
- for row in score_rows:
388
- result_val = str(row.get("Result", "")).lower()
389
- result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
390
- html.append(
391
- '<tr>'
392
- f'<td>{_escape(row.get("Metric", ""))}</td>'
393
- f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
394
- f'<td>{_escape(str(row.get("Score", "")))}</td>'
395
- f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
396
- f'<td>{_escape(str(row.get("Reason", "")))}</td>'
397
- '</tr>'
398
- )
399
- html.append(' </table>')
428
+ _render_metric_table(html, score_rows)
400
429
 
401
430
  html.append(' </article>')
402
431
 
@@ -16,7 +16,7 @@ from azure.ai.evaluation import AzureOpenAIModelConfiguration
16
16
  from dotenv import load_dotenv
17
17
 
18
18
  from api_clients.A2A import A2AClient
19
- from auth.auth_handler import AuthHandler
19
+ from auth.auth_handler import AuthHandler, make_token_refresh_fn
20
20
  from evaluator_resolver import resolve_default_evaluators
21
21
  from version_check import check_min_version, get_cli_version
22
22
 
@@ -122,6 +122,7 @@ def main():
122
122
  agent_client = A2AClient(
123
123
  a2a_endpoint=a2a_endpoint,
124
124
  access_token=a2a_access_token,
125
+ token_refresh_fn=make_token_refresh_fn(a2a_auth_handler),
125
126
  logger=CLI_LOGGER,
126
127
  diagnostic_records=DIAGNOSTIC_RECORDS,
127
128
  )
@@ -36,7 +36,7 @@ AZURE_AI_API_KEY="<azure-openai-key>"
36
36
  AZURE_AI_API_VERSION="2024-12-01-preview"
37
37
  AZURE_AI_MODEL_NAME="gpt-4o-mini"
38
38
 
39
- # Your Tenant Id
39
+ # Your Tenant ID (or use TEAMS_APP_TENANT_ID from ATK .env.local)
40
40
  TENANT_ID="<aad-tenant-id>"
41
41
 
42
42
  # Optional: default agent id (overridable via --m365-agent-id)