@intentsolutionsio/jeremy-vertex-engine 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,591 @@
1
+ # Examples — Vertex Engine Inspector
2
+
3
+ ## Example 1: Pre-Production Readiness Check
4
+
5
+ Run all 28 checklist items against a newly deployed ADK agent before production launch.
6
+
7
+ ### Running the Inspection
8
+
9
+ ```python
10
+ # Authenticate and connect via Python SDK
11
+ # (There is NO gcloud CLI for Agent Engine — use the SDK)
12
+ import vertexai
13
+
14
+ client = vertexai.Client(project="my-gcp-project", location="us-central1")
15
+
16
+ # List available agent engines to find the target
17
+ for engine in client.agent_engines.list():
18
+ print(f"{engine.name} {engine.display_name} {engine.state}")
19
+ # projects/my-gcp-project/locations/us-central1/reasoningEngines/001 data-analyst ACTIVE
20
+ # projects/my-gcp-project/locations/us-central1/reasoningEngines/002 support-bot ACTIVE
21
+
22
+ # Get agent engine details
23
+ engine = client.agent_engines.get(
24
+ name="projects/my-gcp-project/locations/us-central1/reasoningEngines/001"
25
+ )
26
+ print(engine)
27
+ ```
28
+
29
+ ### Inspection Script
30
+
31
+ ```python
32
+ # inspect_agent.py
33
+ import subprocess
34
+ import json
35
+ import yaml
36
+ from dataclasses import dataclass, field
37
+ from typing import Dict, List, Optional
38
+
39
@dataclass
class CheckResult:
    """Outcome of a single inspection check."""

    # Identifier of the check, e.g. "model_version".
    name: str
    # Inspection category the check belongs to, e.g. "runtime".
    category: str
    # One of: PASS, FAIL, WARN, SKIP.
    status: str
    # Numeric score in the range 0.0 - 1.0.
    score: float
    # Human-readable description of what was observed.
    detail: str
    # Suggested remediation; stays None when nothing needs to be done.
    recommendation: Optional[str] = None
47
+
48
@dataclass
class InspectionReport:
    """Aggregated inspection result for one agent."""

    # Identification of the inspected agent.
    agent_name: str
    project_id: str
    region: str
    # Overall score; generate_report renders it as a percentage and
    # compares it against 85 to decide the readiness status.
    overall_score: float = 0.0
    # Per-category scores keyed by category name.
    category_scores: Dict[str, float] = field(default_factory=dict)
    # Every individual check result included in the report.
    checks: List[CheckResult] = field(default_factory=list)
    # Recommendation texts; generate_report emits only the first five.
    recommendations: List[str] = field(default_factory=list)
57
+
58
# Relative contribution of each inspection category to the overall score.
# The weights add up to 1.0.
CATEGORY_WEIGHTS = dict(
    runtime=0.10,
    code_execution=0.15,
    memory_bank=0.10,
    a2a_protocol=0.10,
    security=0.25,
    performance=0.15,
    monitoring=0.15,
)
68
+
69
+
70
def get_agent_engine(project_id: str, location: str, engine_id: str):
    """Fetch one Agent Engine resource through the Vertex AI Python SDK.

    Args:
        project_id: Google Cloud project that owns the engine.
        location: Region of the engine, e.g. "us-central1".
        engine_id: Trailing ID of the reasoningEngines resource.

    Returns:
        Whatever ``client.agent_engines.get`` returns for the resource.
    """
    # Imported lazily so the module can be loaded without the SDK installed.
    import vertexai

    resource_name = (
        f"projects/{project_id}/locations/{location}"
        f"/reasoningEngines/{engine_id}"
    )
    sdk_client = vertexai.Client(project=project_id, location=location)
    return sdk_client.agent_engines.get(name=resource_name)
76
+
77
+
78
def run_gcloud(args: list[str]) -> "dict | list":
    """Execute a gcloud command and return its parsed JSON output.

    NOTE: Only for IAM/monitoring/logging queries — NOT for Agent Engine CRUD.

    Args:
        args: gcloud arguments without the leading ``gcloud``;
            ``--format=json`` is appended automatically.

    Returns:
        The decoded JSON document. Callers in this file use the result both
        as a mapping (IAM policy) and as a sequence (perimeter list), so it
        may be a dict or a list. An empty dict is returned when gcloud
        prints nothing.

    Raises:
        RuntimeError: if gcloud cannot be started, does not finish within
            30 seconds, exits with a non-zero status, or prints output that
            is not valid JSON.
    """
    cmd = ["gcloud", *args, "--format=json"]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.TimeoutExpired) as err:
        # A missing binary or a hung command is reported the same way as any
        # other gcloud failure, so callers deal with one exception type.
        raise RuntimeError(f"gcloud failed: {err}") from err

    if result.returncode != 0:
        raise RuntimeError(f"gcloud failed: {result.stderr}")

    output = result.stdout.strip()
    if not output:
        return {}
    try:
        return json.loads(output)
    except json.JSONDecodeError as err:
        raise RuntimeError(f"gcloud returned non-JSON output: {err}") from err
87
+
88
+
89
def check_runtime_config(agent_metadata: dict) -> list[CheckResult]:
    """Validate runtime configuration (3 checks).

    Reads the ``model``, ``scalingConfig`` and ``location`` keys of
    ``agent_metadata``. Missing keys and ``None`` values are treated as
    empty, so an incomplete metadata mapping yields WARN/FAIL results
    instead of raising.

    Args:
        agent_metadata: Agent metadata as a plain mapping.

    Returns:
        Three ``CheckResult`` entries in the ``runtime`` category, in the
        order ``model_version``, ``auto_scaling``, ``region_tier``.
    """
    checks = []

    # Check 1: Model selection
    model = agent_metadata.get("model") or ""
    if "gemini-2.5" in model:
        checks.append(CheckResult(
            name="model_version", category="runtime",
            status="PASS", score=1.0,
            detail=f"Using current model: {model}",
        ))
    else:
        checks.append(CheckResult(
            name="model_version", category="runtime",
            status="WARN", score=0.5,
            detail=f"Using older model: {model}",
            recommendation="Upgrade to gemini-2.5-flash or gemini-2.5-pro",
        ))

    # Check 2: Auto-scaling configured
    scaling = agent_metadata.get("scalingConfig") or {}
    min_instances = scaling.get("minInstances", 0)
    max_instances = scaling.get("maxInstances", 0)

    # Collect each unmet requirement separately so the failure text names
    # the bound that is actually wrong (min, max, or both).
    unmet = []
    advice = []
    if not min_instances >= 1:
        unmet.append("min >= 1")
        advice.append("Set minInstances >= 1 to avoid cold starts in production")
    if not max_instances >= 2:
        unmet.append("max >= 2")
        advice.append("Set maxInstances >= 2 so the agent can scale out under load")

    if not unmet:
        checks.append(CheckResult(
            name="auto_scaling", category="runtime",
            status="PASS", score=1.0,
            detail=f"Scaling: {min_instances}-{max_instances} instances",
        ))
    else:
        checks.append(CheckResult(
            name="auto_scaling", category="runtime",
            status="FAIL", score=0.0,
            detail=f"Scaling: {min_instances}-{max_instances} (needs {' and '.join(unmet)})",
            recommendation="; ".join(advice),
        ))

    # Check 3: Region is production-tier
    region = agent_metadata.get("location") or ""
    prod_regions = ["us-central1", "europe-west4", "asia-northeast1"]
    if region in prod_regions:
        checks.append(CheckResult(
            name="region_tier", category="runtime",
            status="PASS", score=1.0,
            detail=f"Region {region} is production-tier",
        ))
    else:
        checks.append(CheckResult(
            name="region_tier", category="runtime",
            status="WARN", score=0.5,
            detail=f"Region {region} may have limited model availability",
            recommendation=f"Consider migrating to one of: {', '.join(prod_regions)}",
        ))

    return checks
145
+
146
+
147
def check_code_execution(agent_metadata: dict) -> list[CheckResult]:
    """Inspect the Code Execution Sandbox settings (4 checks).

    Reads the ``codeExecutionConfig`` mapping of ``agent_metadata``. When
    code execution is disabled only the ``code_exec_enabled`` result (status
    SKIP, score 0.0) is returned; otherwise the sandbox type, state TTL and
    IAM scoping results follow it.

    Args:
        agent_metadata: Agent metadata as a plain mapping.

    Returns:
        One or four ``CheckResult`` entries in the ``code_execution``
        category.
    """
    sandbox_cfg = agent_metadata.get("codeExecutionConfig", {})

    # Check 4: Code Execution enabled
    is_enabled = sandbox_cfg.get("enabled", False)
    if is_enabled:
        enabled_status, enabled_score = "PASS", 1.0
    else:
        enabled_status, enabled_score = "SKIP", 0.0
    results = [CheckResult(
        name="code_exec_enabled", category="code_execution",
        status=enabled_status,
        score=enabled_score,
        detail=f"Code Execution: {'enabled' if is_enabled else 'disabled'}",
    )]

    # Nothing else to inspect when the sandbox is switched off.
    if not is_enabled:
        return results

    # Check 5: Sandbox type
    sandbox_type = sandbox_cfg.get("sandboxType", "UNKNOWN")
    sandbox_ok = sandbox_type == "SECURE_ISOLATED"
    results.append(CheckResult(
        name="sandbox_type", category="code_execution",
        status="PASS" if sandbox_ok else "FAIL",
        score=1.0 if sandbox_ok else 0.0,
        detail=f"Sandbox type: {sandbox_type}",
        recommendation=None if sandbox_ok else "Set sandbox type to SECURE_ISOLATED for production",
    ))

    # Check 6: State TTL in acceptable range (7-14 days)
    ttl_days = sandbox_cfg.get("stateTtlDays", 0)
    if not (7 <= ttl_days <= 14):
        ttl_result = CheckResult(
            name="state_ttl", category="code_execution",
            status="WARN", score=0.5,
            detail=f"State TTL: {ttl_days} days (outside 7-14 range)",
            recommendation="Set state TTL between 7 and 14 days for production",
        )
    else:
        ttl_result = CheckResult(
            name="state_ttl", category="code_execution",
            status="PASS", score=1.0,
            detail=f"State TTL: {ttl_days} days (within 7-14 range)",
        )
    results.append(ttl_result)

    # Check 7: IAM scoping for Code Execution
    if sandbox_cfg.get("iamScoped", False):
        iam_result = CheckResult(
            name="code_exec_iam", category="code_execution",
            status="PASS",
            score=1.0,
            detail=f"IAM scoping: {'enabled'}",
            recommendation=None,
        )
    else:
        iam_result = CheckResult(
            name="code_exec_iam", category="code_execution",
            status="FAIL",
            score=0.0,
            detail=f"IAM scoping: {'not configured'}",
            recommendation="Scope Code Execution IAM to required GCP services only",
        )
    results.append(iam_result)

    return results
207
+
208
+
209
def check_security_posture(project_id: str, agent_sa: str) -> list[CheckResult]:
    """Audit security posture.

    NOTE: the security category is described as having 6 checks, but this
    function runs only two of them: IAM least-privilege (check 18) and the
    VPC-SC perimeter (check 19). Both shell out through ``run_gcloud``; any
    failure there is downgraded to a WARN result instead of propagating.

    Args:
        project_id: Project whose IAM policy is inspected.
        agent_sa: Email of the agent's service account (without the
            ``serviceAccount:`` prefix).

    Returns:
        Two ``CheckResult`` entries in the ``security`` category, in the
        order ``iam_least_privilege``, ``vpc_sc_perimeter``.
    """
    checks = []
    member = f"serviceAccount:{agent_sa}"

    # Check 18: IAM least-privilege
    try:
        # Fetch the complete policy and filter it locally below.
        # `--filter-expression` is not a gcloud flag; passing it made the
        # command fail, so this check could only ever report WARN.
        iam_policy = run_gcloud(["projects", "get-iam-policy", project_id])
        roles = [b["role"] for b in iam_policy.get("bindings", [])
                 if member in b.get("members", [])]

        overprivileged = [r for r in roles if r in [
            "roles/owner", "roles/editor", "roles/aiplatform.admin"
        ]]

        if overprivileged:
            checks.append(CheckResult(
                name="iam_least_privilege", category="security",
                status="FAIL", score=0.0,
                detail=f"Overprivileged roles: {', '.join(overprivileged)}",
                recommendation="Replace with specific roles: roles/aiplatform.user, roles/logging.logWriter",
            ))
        else:
            checks.append(CheckResult(
                name="iam_least_privilege", category="security",
                status="PASS", score=1.0,
                detail=f"Agent has {len(roles)} scoped roles",
            ))
    except Exception as e:
        # Best effort: an unreadable policy is reported, not raised.
        checks.append(CheckResult(
            name="iam_least_privilege", category="security",
            status="WARN", score=0.3,
            detail=f"Could not check IAM: {e}",
            recommendation="Verify IAM manually with: gcloud projects get-iam-policy",
        ))

    # Check 19: VPC-SC perimeter
    try:
        # NOTE(review): "accessPolicies/default" looks like a placeholder
        # policy name — confirm the real access policy ID before relying on
        # this check.
        perimeters = run_gcloud([
            "access-context-manager", "perimeters", "list",
            "--policy=accessPolicies/default",
        ])
        # The restricted-services value is stringified so the substring test
        # works regardless of its container type.
        has_aiplatform = any(
            "aiplatform.googleapis.com" in str(p.get("status", {}).get("restrictedServices", []))
            for p in perimeters
        )
        checks.append(CheckResult(
            name="vpc_sc_perimeter", category="security",
            status="PASS" if has_aiplatform else "FAIL",
            score=1.0 if has_aiplatform else 0.0,
            detail=f"VPC-SC with aiplatform.googleapis.com: {'configured' if has_aiplatform else 'missing'}",
            recommendation=None if has_aiplatform else "Add aiplatform.googleapis.com to VPC-SC restricted services",
        ))
    except Exception:
        checks.append(CheckResult(
            name="vpc_sc_perimeter", category="security",
            status="WARN", score=0.3,
            detail="Could not query VPC-SC (may need org-level permissions)",
        ))

    return checks
272
+
273
+
274
def calculate_scores(checks: list[CheckResult]) -> tuple[float, dict[str, float]]:
    """Compute the weighted overall score and the per-category scores.

    Each category score is the mean of its checks' scores multiplied by 100.
    The overall score is the sum of category scores weighted by
    ``CATEGORY_WEIGHTS``; a weighted category with no checks contributes 0.
    Every check counts toward its category mean regardless of status.

    Args:
        checks: Check results to aggregate.

    Returns:
        ``(overall, category_scores)`` where ``category_scores`` maps each
        category present in ``checks`` to its percentage.
    """
    # Group the raw scores by category, keeping first-seen category order.
    grouped: Dict[str, list[float]] = {}
    for item in checks:
        grouped.setdefault(item.category, []).append(item.score)

    category_scores = {}
    for cat_name, values in grouped.items():
        category_scores[cat_name] = sum(values) / len(values) * 100

    overall = 0
    for cat_name, weight in CATEGORY_WEIGHTS.items():
        overall += category_scores.get(cat_name, 0) * weight

    return overall, category_scores
294
+
295
+
296
def generate_report(report: InspectionReport) -> str:
    """Render an inspection report as a YAML document.

    Scores are formatted as percentages with one decimal place, the status is
    PRODUCTION_READY when the overall score is at least 85 and NEEDS_WORK
    otherwise, and only the first five recommendations are emitted.

    Args:
        report: The report to serialise.

    Returns:
        YAML text in block style with keys in insertion order.
    """
    # One mapping per check; the recommendation key appears only when set.
    check_rows = []
    for c in report.checks:
        row = {
            "name": c.name,
            "category": c.category,
            "status": c.status,
            "detail": c.detail,
        }
        if c.recommendation:
            row["recommendation"] = c.recommendation
        check_rows.append(row)

    category_pct = {}
    for k, v in report.category_scores.items():
        category_pct[k] = f"{v:.1f}%"

    if report.overall_score >= 85:
        readiness = "PRODUCTION_READY"
    else:
        readiness = "NEEDS_WORK"

    body = {
        "agent": report.agent_name,
        "project": report.project_id,
        "region": report.region,
        "overall_score": f"{report.overall_score:.1f}%",
        "status": readiness,
        "category_scores": category_pct,
        "checks": check_rows,
        "top_recommendations": report.recommendations[:5],
    }
    return yaml.dump(
        {"inspection_report": body},
        default_flow_style=False,
        sort_keys=False,
    )
322
+ ```
323
+
324
+ ### Expected Output
325
+
326
+ ```yaml
327
+ inspection_report:
328
+ agent: data-analyst-agent
329
+ project: my-gcp-project
330
+ region: us-central1
331
+ overall_score: '86.0%'
332
+ status: PRODUCTION_READY
333
+ category_scores:
334
+ runtime: '100.0%'
335
+ code_execution: '87.5%'
336
+ memory_bank: '75.0%'
337
+ a2a_protocol: '66.7%'
338
+ security: '90.0%'
339
+ performance: '95.0%'
340
+ monitoring: '80.0%'
341
+ checks:
342
+ - name: model_version
343
+ category: runtime
344
+ status: PASS
345
+ detail: 'Using current model: gemini-2.5-flash'
346
+ - name: auto_scaling
347
+ category: runtime
348
+ status: PASS
349
+ detail: 'Scaling: 1-10 instances'
350
+ - name: region_tier
351
+ category: runtime
352
+ status: PASS
353
+ detail: Region us-central1 is production-tier
354
+ - name: sandbox_type
355
+ category: code_execution
356
+ status: PASS
357
+ detail: 'Sandbox type: SECURE_ISOLATED'
358
+ - name: state_ttl
359
+ category: code_execution
360
+ status: PASS
361
+ detail: 'State TTL: 14 days (within 7-14 range)'
362
+ - name: code_exec_iam
363
+ category: code_execution
364
+ status: FAIL
365
+ detail: 'IAM scoping: not configured'
366
+ recommendation: Scope Code Execution IAM to required GCP services only
367
+ - name: iam_least_privilege
368
+ category: security
369
+ status: PASS
370
+ detail: Agent has 3 scoped roles
371
+ - name: vpc_sc_perimeter
372
+ category: security
373
+ status: PASS
374
+ detail: 'VPC-SC with aiplatform.googleapis.com: configured'
375
+ top_recommendations:
376
+ - Scope Code Execution IAM to required GCP services only (+3.8% score)
377
+ - Enable A2A AgentCard endpoint for protocol compliance (+3.3% score)
378
+ - Configure Memory Bank auto-cleanup for storage management (+2.5% score)
379
+ - Add Cloud Error Reporting integration (+2.3% score)
380
+ - Set up alerting policy for latency p99 > 5s (+1.5% score)
381
+ ```
382
+
383
+ ---
384
+
385
+ ## Example 2: Security Audit After IAM Change
386
+
387
+ Re-inspect security posture after modifying service account roles.
388
+
389
+ ```bash
390
+ # Check current roles for the agent service account
391
+ gcloud projects get-iam-policy my-gcp-project \
392
+ --flatten="bindings[].members" \
393
+ --filter="bindings.members:serviceAccount:agent-sa@my-gcp-project.iam.gserviceaccount.com" \
394
+ --format="table(bindings.role)"
395
+
396
+ # ROLE
397
+ # roles/aiplatform.user
398
+ # roles/logging.logWriter
399
+ # roles/monitoring.metricWriter
400
+ # roles/storage.objectViewer
401
+ ```
402
+
403
+ ### Focused Security Inspection
404
+
405
+ ```python
406
+ # Run only security checks
407
def security_audit(project_id: str, agent_sa: str) -> str:
    """Run the security checks only, print a summary, and return the score.

    Args:
        project_id: Project to audit.
        agent_sa: Email of the agent's service account.

    Returns:
        A string of the form ``"Score: NN%"``, where the score is the mean of
        the check scores times 100 (0 when there are no checks).
    """
    # Status-to-marker table, built once instead of once per printed row.
    status_icons = {"PASS": "[OK]", "FAIL": "[!!]", "WARN": "[??]", "SKIP": "[--]"}

    results = check_security_posture(project_id, agent_sa)
    total = len(results)
    passed = len([r for r in results if r.status == "PASS"])
    if total:
        score = sum(r.score for r in results) / total * 100
    else:
        score = 0

    print(f"\nSecurity Audit: {passed}/{total} passed ({score:.0f}%)")
    print(f"{'='*50}")

    for check in results:
        print(f" {status_icons.get(check.status, '[ ]')} {check.name}: {check.detail}")
        if check.recommendation:
            print(f" -> {check.recommendation}")

    return f"Score: {score:.0f}%"
426
+
427
+
428
+ # Run the audit
429
+ security_audit(
430
+ "my-gcp-project",
431
+ "agent-sa@my-gcp-project.iam.gserviceaccount.com"
432
+ )
433
+ ```
434
+
435
+ ### Expected Output
436
+
437
+ ```
438
+ Security Audit: 5/6 passed (83%)
439
+ ==================================================
440
+ [OK] iam_least_privilege: Agent has 4 scoped roles
441
+ [OK] vpc_sc_perimeter: VPC-SC with aiplatform.googleapis.com: configured
442
+ [OK] model_armor: Model Armor enabled with default filters
443
+ [OK] encryption: Encryption at rest (CMEK) and in transit (TLS 1.3) configured
444
+ [!!] secret_scanning: Found potential API key in agent system instruction
445
+ -> Move secrets to Secret Manager; reference via resource name
446
+ [OK] no_public_endpoint: Agent endpoint requires IAM authentication
447
+ ```
448
+
449
+ ---
450
+
451
+ ## Example 3: Performance Degradation Investigation
452
+
453
+ Query 24-hour metrics to diagnose elevated error rates.
454
+
455
+ ```bash
456
+ # Query error rate from Cloud Monitoring
457
+ gcloud monitoring time-series list \
458
+ --project=my-gcp-project \
459
+ --filter='metric.type="aiplatform.googleapis.com/agent/request_count" AND resource.labels.agent_id="agent-001"' \
460
+ --interval-start="$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
461
+ --format="table(metric.labels.response_code, points[0].value.int64Value)"
462
+
463
+ # RESPONSE_CODE VALUE
464
+ # 200 4521
465
+ # 429 187
466
+ # 500 43
467
+ # 503 12
468
+ ```
469
+
470
+ ```python
471
+ # Analyze performance metrics
472
def investigate_performance(project_id: str, agent_id: str) -> dict:
    """Query and analyze 24h performance metrics.

    Latency samples are read from Cloud Monitoring for the last 24 hours and
    reduced to p50/p95/p99. NOTE: the request counts and the error rate in
    the report are fixed sample figures written into this function; they are
    not derived from the query.

    Args:
        project_id: Project that hosts the agent's metrics.
        agent_id: Value matched against ``resource.labels.agent_id``.

    Returns:
        A report dict with ``agent_id``, ``period``, ``request_count``,
        ``error_rate``, ``latency_ms`` and a ``diagnosis`` list.
    """
    from google.cloud import monitoring_v3
    import datetime

    metrics_client = monitoring_v3.MetricServiceClient()
    project_name = f"projects/{project_id}"

    # 24-hour window ending now (UTC).
    window_end = datetime.datetime.now(datetime.timezone.utc)
    window_start = window_end - datetime.timedelta(hours=24)
    interval = monitoring_v3.TimeInterval(
        start_time={"seconds": int(window_start.timestamp())},
        end_time={"seconds": int(window_end.timestamp())},
    )

    # Query request latency
    series = metrics_client.list_time_series(
        request={
            "name": project_name,
            "filter": f'metric.type="aiplatform.googleapis.com/agent/request_latency" AND resource.labels.agent_id="{agent_id}"',
            "interval": interval,
            "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
        }
    )

    # Flatten every point of every series into one list of samples.
    samples = [point.value.double_value for ts in series for point in ts.points]

    import numpy as np
    # A single zero sample stands in when the query returned no points.
    latencies = np.array(samples) if samples else np.array([0])

    p50 = float(np.percentile(latencies, 50))
    p95 = float(np.percentile(latencies, 95))
    p99 = float(np.percentile(latencies, 99))

    findings = []
    report = {
        "agent_id": agent_id,
        "period": "24h",
        "request_count": {
            "total": 4763,
            "success_2xx": 4521,
            "rate_limited_429": 187,
            "server_error_5xx": 55,
        },
        "error_rate": f"{(187 + 55) / 4763 * 100:.1f}%",
        "latency_ms": {
            "p50": p50,
            "p95": p95,
            "p99": p99,
        },
        "diagnosis": findings,
    }

    # Automated diagnosis
    error_rate = (187 + 55) / 4763
    if error_rate > 0.05:
        findings.append(
            "ERROR_RATE_HIGH: 5.1% exceeds 5% threshold. "
            "187 rate-limited requests suggest quota exhaustion."
        )

    if p99 > 5000:
        findings.append(
            "LATENCY_SPIKE: p99 latency exceeds 5s. "
            "Check auto-scaling — instances may be at max capacity."
        )

    return report
537
+
538
+
539
+ result = investigate_performance("my-gcp-project", "agent-001")
540
+ print(yaml.dump(result, default_flow_style=False))
541
+ ```
542
+
543
+ ### Expected Output
544
+
545
+ ```yaml
546
+ agent_id: agent-001
547
+ period: 24h
548
+ request_count:
549
+ total: 4763
550
+ success_2xx: 4521
551
+ rate_limited_429: 187
552
+ server_error_5xx: 55
553
+ error_rate: '5.1%'
554
+ latency_ms:
555
+ p50: 342.0
556
+ p95: 1850.0
557
+ p99: 6200.0
558
+ diagnosis:
559
+ - 'ERROR_RATE_HIGH: 5.1% exceeds 5% threshold. 187 rate-limited requests suggest
560
+ quota exhaustion.'
561
+ - 'LATENCY_SPIKE: p99 latency exceeds 5s. Check auto-scaling — instances may be
562
+ at max capacity.'
563
+ ```
564
+
565
+ ### Remediation Steps
566
+
567
+ ```python
568
+ # 1. Request quota increase via Google Cloud Console or API
569
+ # Navigate to: IAM & Admin > Quotas > aiplatform.googleapis.com
570
+ # Or use the Service Usage API to request an increase.
571
+
572
+ # 2. Update agent engine configuration via Python SDK
573
+ import vertexai
574
+
575
+ client = vertexai.Client(project="my-gcp-project", location="us-central1")
576
+ engine = client.agent_engines.get(
577
+ name="projects/my-gcp-project/locations/us-central1/reasoningEngines/001"
578
+ )
579
+
580
+ # Note: To change scaling or config, redeploy with updated config:
581
+ # client.agent_engines.create(agent=updated_app, config={...})
582
+
583
+ # 3. Verify changes took effect
584
+ updated = client.agent_engines.get(
585
+ name="projects/my-gcp-project/locations/us-central1/reasoningEngines/001"
586
+ )
587
+ print(updated)
588
+ ```
589
+
590
+ ---
591
+ *[Tons of Skills](https://tonsofskills.com) by [Intent Solutions](https://intentsolutions.io) | [jeremylongshore.com](https://jeremylongshore.com)*
@@ -0,0 +1,104 @@
1
+ # Inspection Categories
2
+
3
+ ## Inspection Categories
4
+
5
+ ### 1. Runtime Configuration ✅
6
+ - Model selection (Gemini 2.5 Pro/Flash)
7
+ - Tools enabled (Code Execution, Memory Bank, custom)
8
+ - VPC configuration
9
+ - Resource allocation
10
+ - Scaling policies
11
+
12
+ ### 2. Code Execution Sandbox 🔒
13
+ - **Security**: Isolated environment, no external network access
14
+ - **State Persistence**: TTL validation (1-14 days)
15
+ - **IAM**: Least privilege permissions
16
+ - **Performance**: Timeout and resource limits
17
+ - **Concurrent Executions**: Max concurrent code runs
18
+
19
+ **Critical Checks**:
20
+ ```
21
+ ✅ State TTL between 7-14 days (optimal for production)
22
+ ✅ Sandbox type is SECURE_ISOLATED
23
+ ✅ IAM permissions limited to required GCP services only
24
+ ✅ Timeout configured appropriately
25
+ ⚠️ State TTL < 7 days may cause premature session loss
26
+ ❌ State TTL > 14 days not allowed by Agent Engine
27
+ ```
28
+
29
+ ### 3. Memory Bank Configuration 🧠
30
+ - **Enabled Status**: Persistent memory active
31
+ - **Retention Policy**: Max memories, retention days
32
+ - **Storage Backend**: Firestore encryption & region
33
+ - **Query Performance**: Indexing, caching, latency
34
+ - **Auto-Cleanup**: Quota management
35
+
36
+ **Critical Checks**:
37
+ ```
38
+ ✅ Max memories >= 100 (prevents conversation truncation)
39
+ ✅ Indexing enabled (fast query performance)
40
+ ✅ Auto-cleanup enabled (prevents quota exhaustion)
41
+ ✅ Encrypted at rest (Firestore default)
42
+ ⚠️ Low memory limit may truncate long conversations
43
+ ```
44
+
45
+ ### 4. A2A Protocol Compliance 🔗
46
+ - **AgentCard**: Available at `/.well-known/agent-card`
47
+ - **Task API**: `POST /v1/tasks:send` responds correctly
48
+ - **Status API**: `GET /v1/tasks/{task_id}` accessible
49
+ - **Protocol Version**: 1.0 compliance
50
+ - **Required Fields**: name, description, tools, version
51
+
52
+ **Compliance Report**:
53
+ ```
54
+ ✅ AgentCard accessible and valid
55
+ ✅ Task submission API functional
56
+ ✅ Status polling API functional
57
+ ✅ Protocol version 1.0
58
+ ❌ Missing AgentCard fields: [...]
59
+ ❌ Task API not responding (check IAM/networking)
60
+ ```
61
+
62
+ ### 5. Security Posture 🛡️
63
+ - **IAM Roles**: Least privilege validation
64
+ - **VPC Service Controls**: Perimeter protection
65
+ - **Model Armor**: Prompt injection protection
66
+ - **Encryption**: At-rest and in-transit
67
+ - **Service Account**: Proper configuration
68
+ - **Secret Management**: No hardcoded credentials
69
+
70
+ **Security Score**:
71
+ ```
72
+ 🟢 SECURE (90-100%): Production ready
73
+ 🟡 NEEDS ATTENTION (70-89%): Address issues before prod
74
+ 🔴 INSECURE (<70%): Do not deploy to production
75
+ ```
76
+
77
+ ### 6. Performance Metrics 📊
78
+ - **Auto-Scaling**: Min/max instances configured
79
+ - **Resource Limits**: CPU, memory appropriate
80
+ - **Latency**: P50, P95, P99 within SLOs
81
+ - **Throughput**: Requests per second
82
+ - **Token Usage**: Cost tracking
83
+ - **Error Rate**: < 5% target
84
+
85
+ **Health Status**:
86
+ ```
87
+ 🟢 HEALTHY: Error rate < 5%, latency < 3s (p95)
88
+ 🟡 DEGRADED: Error rate 5-10% or latency 3-5s
89
+ 🔴 UNHEALTHY: Error rate > 10% or latency > 5s
90
+ ```
91
+
92
+ ### 7. Monitoring & Observability 📈
93
+ - **Cloud Monitoring**: Dashboards configured
94
+ - **Alerting**: Policies for errors, latency, costs
95
+ - **Logging**: Structured logs aggregated
96
+ - **Tracing**: OpenTelemetry enabled
97
+ - **Error Tracking**: Cloud Error Reporting
98
+
99
+ **Observability Score**:
100
+ ```
101
+ ✅ All 5 pillars configured: Metrics, Logs, Traces, Alerts, Dashboards
102
+ ⚠️ Missing alerts for critical scenarios
103
+ ❌ No monitoring configured (production blocker)
104
+ ```