@xdev-asia/xdev-knowledge-mcp 1.0.52 → 1.0.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,199 +2,707 @@
2
2
  "id": "gcp-ml-engineer",
3
3
  "title": "Google Cloud Professional ML Engineer",
4
4
  "slug": "gcp-ml-engineer",
5
- "description": "Luyện thi chứng chỉ Google Cloud Professional Machine Learning Engineer",
5
+ "description": "Practice exam for Google Cloud Professional Machine Learning Engineer — 50 questions covering all domains",
6
6
  "icon": "award",
7
7
  "provider": "Google Cloud",
8
- "level": "Chuyên nghiệp",
8
+ "level": "Professional",
9
9
  "duration_minutes": 120,
10
10
  "passing_score": 70,
11
- "questions_count": 15,
12
- "tags": [
13
- "GCP",
14
- "ML",
15
- "Vertex AI"
16
- ],
11
+ "questions_count": 50,
12
+ "tags": ["GCP", "ML", "Vertex AI", "BigQuery ML", "MLOps", "TFX"],
17
13
  "series_slug": "luyen-thi-gcp-ml-engineer",
14
+ "domains": [
15
+ {
16
+ "name": "Domain 1: ML Problem Framing & Architecture",
17
+ "weight": 20,
18
+ "lessons": [
19
+ { "title": "Bài 1: Framing ML Problems — Supervised, Unsupervised, RL", "slug": "bai-1-framing-ml-problems" },
20
+ { "title": "Bài 2: GCP AI/ML Ecosystem Overview", "slug": "bai-2-gcp-ai-ml-ecosystem" }
21
+ ]
22
+ },
23
+ {
24
+ "name": "Domain 2: Data Engineering & Feature Engineering",
25
+ "weight": 20,
26
+ "lessons": [
27
+ { "title": "Bài 3: Data Pipeline — Dataflow, Pub/Sub, Dataproc", "slug": "bai-3-data-pipeline" },
28
+ { "title": "Bài 4: Feature Engineering & Vertex AI Feature Store", "slug": "bai-4-feature-engineering" }
29
+ ]
30
+ },
31
+ {
32
+ "name": "Domain 3: Model Development on Vertex AI",
33
+ "weight": 20,
34
+ "lessons": [
35
+ { "title": "Bài 5: Vertex AI Training — Custom & AutoML", "slug": "bai-5-vertex-ai-training" },
36
+ { "title": "Bài 6: BigQuery ML & TensorFlow on GCP", "slug": "bai-6-bigquery-ml-tensorflow" }
37
+ ]
38
+ },
39
+ {
40
+ "name": "Domain 4: Model Deployment & MLOps",
41
+ "weight": 20,
42
+ "lessons": [
43
+ { "title": "Bài 7: Model Deployment & Prediction", "slug": "bai-7-model-deployment" },
44
+ { "title": "Bài 8: Vertex AI Pipelines & MLOps", "slug": "bai-8-vertex-ai-pipelines-mlops" }
45
+ ]
46
+ },
47
+ {
48
+ "name": "Domain 5: Responsible AI, Security & Exam Strategy",
49
+ "weight": 20,
50
+ "lessons": [
51
+ { "title": "Bài 9: Responsible AI & Security", "slug": "bai-9-responsible-ai" },
52
+ { "title": "Bài 10: Cheat Sheet & Chiến lược thi", "slug": "bai-10-cheat-sheet-chien-luoc-thi" }
53
+ ]
54
+ }
55
+ ],
18
56
  "questions": [
19
57
  {
20
58
  "id": 1,
21
- "question": "Vertex AI Pipeline được xây dựng trên framework nào?",
59
+ "domain": "Domain 1: ML Problem Framing & Architecture",
60
+ "question": "A retail company wants to predict which customers will churn in the next 30 days. They have 2 years of historical data with labels. Which ML approach is MOST appropriate?",
22
61
  "options": [
23
- "Apache Spark",
24
- "Kubeflow Pipelines / TFX",
25
- "Apache Airflow",
26
- "Jenkins"
62
+ "Unsupervised clustering to group similar customers",
63
+ "Supervised binary classification using historical churn labels",
64
+ "Reinforcement learning to learn optimal retention actions",
65
+ "Anomaly detection on transaction patterns"
27
66
  ],
28
67
  "correct": 1,
29
- "explanation": "Vertex AI Pipelines dựa trên Kubeflow Pipelines SDK TFX (TensorFlow Extended), cho phép orchestrate ML workflow trên Google Cloud."
68
+ "explanation": "This is a classic supervised binary classification problem — predicting a binary outcome (churn/no-churn) using labeled historical data. The target variable is clearly defined and data is available."
30
69
  },
31
70
  {
32
71
  "id": 2,
33
- "question": "BigQuery ML cho phép làm gì đặc biệt?",
72
+ "domain": "Domain 1: ML Problem Framing & Architecture",
73
+ "question": "A team wants to build a product recommendation system but has NO historical interaction data. Which approach should they start with?",
34
74
  "options": [
35
- "Chỉ query dữ liệu",
36
- "Train deploy ML model trực tiếp bằng SQL trong BigQuery",
37
- "Chỉ export dữ liệu sang CSV",
38
- "Quản Kubernetes cluster"
75
+ "Collaborative filtering",
76
+ "Content-based filtering using product attributes",
77
+ "Matrix factorization",
78
+ "Deep neural collaborative filtering"
39
79
  ],
40
80
  "correct": 1,
41
- "explanation": "BigQuery ML (BQML) cho phép data analysts train model ML bằng SQL quen thuộc ngay trong BigQuery không cần viết Python hay setup infrastructure riêng."
81
+ "explanation": "Without user-item interaction data (cold start problem), collaborative filtering won't work. Content-based filtering uses product attributes (category, description, price) to recommend similar items — it doesn't require interaction history."
42
82
  },
43
83
  {
44
84
  "id": 3,
45
- "question": "Vertex AI Feature Store khác so với lưu features trong database thông thường?",
85
+ "domain": "Domain 1: ML Problem Framing & Architecture",
86
+ "question": "Which GCP service should you use for a simple image classification task when you have limited ML expertise and a small labeled dataset?",
46
87
  "options": [
47
- "Không khác",
48
- "Hỗ trợ serving features với low-latency, đảm bảo training-serving consistency, và feature versioning",
49
- "Chỉ hỗ trợ structured data",
50
- "Chỉ dùng được với TensorFlow"
88
+ "Vertex AI Custom Training with a custom TensorFlow model",
89
+ "Vertex AI AutoML Vision",
90
+ "Cloud Vision API (pre-trained)",
91
+ "BigQuery ML"
51
92
  ],
52
93
  "correct": 1,
53
- "explanation": "Feature Store chuyên biệt cho ML: serving features online (low-latency) offline (batch), đảm bảo features đồng nhất giữa training serving, hỗ trợ time-travel monitoring."
94
+ "explanation": "AutoML Vision is ideal for custom image classification when you have labeled data but limited ML expertise. Cloud Vision API only supports pre-defined labels. Custom training requires significant ML knowledge."
54
95
  },
55
96
  {
56
97
  "id": 4,
57
- "question": "Khi nào nên dùng AutoML thay vì custom training trên Vertex AI?",
98
+ "domain": "Domain 1: ML Problem Framing & Architecture",
99
+ "question": "A company needs to extract text from scanned documents and receipts. Which GCP service is MOST appropriate?",
58
100
  "options": [
59
- "Khi cần kiểm soát hoàn toàn architecture",
60
- "Khi team không có nhiều ML expertise hoặc cần baseline model nhanh",
61
- "Khi dataset rất lớn (>1TB)",
62
- "Khi cần distributed training"
101
+ "Cloud Natural Language API",
102
+ "Document AI",
103
+ "Cloud Vision API — OCR only",
104
+ "Vertex AI AutoML Text"
63
105
  ],
64
106
  "correct": 1,
65
- "explanation": "AutoML phù hợp khi cần model nhanh, team ít ML expertise, hoặc cần baseline. Custom training khi cần kiểm soát architecture, thuật toán đặc thù, hoặc tối ưu sâu."
107
+ "explanation": "Document AI is purpose-built for extracting structured data from documents (invoices, receipts, forms). It goes beyond simple OCR by understanding document structure, key-value pairs, and tables."
66
108
  },
67
109
  {
68
110
  "id": 5,
69
- "question": "Vertex AI Experiments dùng để?",
111
+ "domain": "Domain 1: ML Problem Framing & Architecture",
112
+ "question": "When framing an ML problem, which metric is MOST important to align with stakeholders FIRST?",
70
113
  "options": [
71
- "Deploy model lên production",
72
- "Track, compare và reproduce ML experiments (hyperparameters, metrics, artifacts)",
73
- "Tạo dataset mới",
74
- "Quản lý IAM"
114
+ "Model accuracy on test set",
115
+ "AUC-ROC curve",
116
+ "Business KPI that the model is expected to improve",
117
+ "F1 score"
75
118
  ],
76
- "correct": 1,
77
- "explanation": "Vertex AI Experiments cung cấp experiment tracking: log hyperparameters, metrics, model artifactscho phép compare nhiều runs reproduce kết quả."
119
+ "correct": 2,
120
+ "explanation": "The first step in framing is defining how ML success maps to business outcomes. ML metrics (accuracy, AUC) are proxies — stakeholders care about business KPIs (revenue, cost reduction, customer satisfaction)."
78
121
  },
79
122
  {
80
123
  "id": 6,
81
- "question": "TFX (TensorFlow Extended) bao gồm những component chính nào?",
124
+ "domain": "Domain 1: ML Problem Framing & Architecture",
125
+ "question": "A fraud detection system needs to catch 99% of fraud even if some legitimate transactions are flagged. Which metric should be optimized?",
82
126
  "options": [
83
- "Chỉ có ExampleGen và Trainer",
84
- "ExampleGen, StatisticsGen, SchemaGen, ExampleValidator, Transform, Trainer, Evaluator, Pusher",
85
- "Chỉ có Trainer và Serving",
86
- "Chỉ có Transform và Evaluator"
127
+ "Precision",
128
+ "Recall (Sensitivity)",
129
+ "Accuracy",
130
+ "Specificity"
87
131
  ],
88
132
  "correct": 1,
89
- "explanation": "TFX end-to-end ML platform gồm: ExampleGen (ingest), StatisticsGen + SchemaGen + ExampleValidator (validate), Transform (feature eng), Trainer, Tuner, Evaluator, Pusher (deploy)."
133
+ "explanation": "Recall = TP/(TP+FN) measures the fraction of actual positives correctly identified. 99% recall means catching 99% of fraud. Precision would prioritize minimizing false alarms instead."
90
134
  },
91
135
  {
92
136
  "id": 7,
93
- "question": "Vertex AI Model Monitoring kiểm tra điều gì?",
137
+ "domain": "Domain 1: ML Problem Framing & Architecture",
138
+ "question": "Which GCP service decision is correct?",
94
139
  "options": [
95
- "Chỉ monitor CPU/memory",
96
- "Skew (training-serving) drift (prediction data thay đổi theo thời gian)",
97
- "Chỉ monitor latency",
98
- "Chỉ monitor cost"
140
+ "Use BigQuery ML when you need custom PyTorch architectures",
141
+ "Use Vertex AI Custom Training when you need full control over training code, framework, and infrastructure",
142
+ "Use AutoML when you need to implement a custom loss function",
143
+ "Use Cloud Vision API when you need to classify images into custom categories"
99
144
  ],
100
145
  "correct": 1,
101
- "explanation": "Model Monitoring phát hiện: training-serving skew (feature distribution khác nhau) prediction drift (dữ liệu production drift khỏi baseline), trigger alert khi vượt threshold."
146
+ "explanation": "Vertex AI Custom Training gives full control: any framework (TF, PyTorch, XGBoost), custom code, custom containers, distributed training, GPU/TPU selection. AutoML and BigQuery ML have constraints on customization."
102
147
  },
103
148
  {
104
149
  "id": 8,
105
- "question": "Google Cloud AI Platform Prediction hỗ trợ chiến lược deploy nào?",
150
+ "domain": "Domain 1: ML Problem Framing & Architecture",
151
+ "question": "A time series forecasting model needs to predict daily sales for 1,000 products. Which GCP approach is MOST scalable?",
106
152
  "options": [
107
- "Chỉ single model deployment",
108
- "Traffic splitting cho A/B testing canary deployments",
109
- "Chỉ batch prediction",
110
- "Chỉ edge deployment"
153
+ "Train 1,000 individual ARIMA models in Cloud Functions",
154
+ "Use Vertex AI Forecasting (AutoML) which handles multiple time series natively",
155
+ "Use a single linear regression model for all products",
156
+ "Use BigQuery ML's ARIMA_PLUS with a single query"
111
157
  ],
112
158
  "correct": 1,
113
- "explanation": "Vertex AI Prediction hỗ trợ traffic splitting: thể route % traffic sang model versions khác nhau phục vụ A/B testing, canary release, progressive rollout."
159
+ "explanation": "Vertex AI Forecasting (AutoML) is designed for large-scale time series: it handles thousands of series, automatically selects algorithms, and manages training/serving. BigQuery ML ARIMA_PLUS is also viable but AutoML handles more complexity."
114
160
  },
115
161
  {
116
162
  "id": 9,
117
- "question": "Dataflow trong ML pipeline đóng vai trò gì?",
163
+ "domain": "Domain 1: ML Problem Framing & Architecture",
164
+ "question": "A model achieves 99.5% accuracy on a fraud dataset where only 0.5% of transactions are fraud. What is the problem?",
118
165
  "options": [
119
- "Training model",
120
- "Xử dữ liệu quy lớn (batch & streaming) cho data preprocessing/feature engineering",
121
- "Deploy model",
122
- "Monitor model"
166
+ "The model is overfitting",
167
+ "The high accuracy is misleading — the model may just predict 'not fraud' for everything (class imbalance)",
168
+ "The model needs more training data",
169
+ "The learning rate is too high"
123
170
  ],
124
171
  "correct": 1,
125
- "explanation": "Dataflow (dựa trên Apache Beam) xử data scale lớn: ETL, feature engineering cho cả batch streaming bước tiền xử quan trọng trong ML pipeline."
172
+ "explanation": "With 99.5% negative class, a model that always predicts 'not fraud' gets 99.5% accuracy. This is the class imbalance problem. Use precision, recall, F1, AUC-PR instead of accuracy for imbalanced datasets."
126
173
  },
127
174
  {
128
175
  "id": 10,
129
- "question": "Vertex AI Matching Engine dùng cho bài toán nào?",
176
+ "domain": "Domain 1: ML Problem Framing & Architecture",
177
+ "question": "Which is NOT a valid reason to choose ML over a rule-based system?",
130
178
  "options": [
131
- "Training model",
132
- "Tìm kiếm nearest neighbor (vector similarity search) quy mô lớn",
133
- "Data labeling",
134
- "Model serving thông thường"
179
+ "The problem involves complex patterns that are hard to specify manually",
180
+ "The relationships in data change over time requiring adaptation",
181
+ "A simple if-else logic with 5 rules can solve the problem with 99% accuracy",
182
+ "The input data is unstructured (images, text, audio)"
135
183
  ],
136
- "correct": 1,
137
- "explanation": "Matching Engine managed approximate nearest neighbor (ANN) service dùng cho similarity search, recommendation, RAG retrieval quy tỷ vectors."
184
+ "correct": 2,
185
+ "explanation": "If simple rules achieve 99% accuracy, ML adds unnecessary complexity. ML should be chosen when rules are too complex, patterns evolve over time, or data is unstructured — not when simple heuristics already work."
138
186
  },
139
187
  {
140
188
  "id": 11,
141
- "question": "Vertex AI Workbench khác Colab Enterprise?",
189
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
190
+ "question": "You need to build a real-time feature engineering pipeline that processes streaming events and writes to Feature Store. Which GCP architecture is correct?",
142
191
  "options": [
143
- "Giống hệt nhau",
144
- "Workbench JupyterLab managed instances cho ML production, Colab Enterprise cho collaboration và exploration",
145
- "Workbench chỉ support R",
146
- "Colab Enterprise chỉ dùng miễn phí"
192
+ "Cloud Storage → Dataproc Batch → Feature Store",
193
+ "Pub/Sub → Dataflow Streaming → Vertex AI Feature Store",
194
+ "Cloud Functions → BigQuery → Feature Store",
195
+ "Pub/Sub → Cloud Composer → Feature Store"
147
196
  ],
148
197
  "correct": 1,
149
- "explanation": "Workbench cung cấp JupyterLab managed instances với tích hợp sâu vào GCP services (BigQuery, GCS) cho production ML. Colab Enterprise thiên về collaboration, sharing exploration."
198
+ "explanation": "Pub/Sub ingests streaming events, Dataflow (Apache Beam) processes them in real-time for feature computation, and writes to Vertex AI Feature Store for online serving with low latency."
150
199
  },
151
200
  {
152
201
  "id": 12,
153
- "question": "Kỹ thuật nào giảm kích thước model để deploy trên edge devices?",
202
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
203
+ "question": "What is the PRIMARY advantage of Vertex AI Feature Store over storing features in a regular database?",
154
204
  "options": [
155
- "Tăng layers",
156
- "Quantization, pruning, knowledge distillation",
157
- "Tăng batch size",
158
- "Dùng thêm GPU"
205
+ "Lower storage cost",
206
+ "Training-serving consistency — same features used during training are served in production with low-latency online serving",
207
+ "Better SQL query performance",
208
+ "Automatic model training"
159
209
  ],
160
210
  "correct": 1,
161
- "explanation": "Model compression: Quantization (giảm precision: FP32→INT8), Pruning (loại bỏ weights/neurons không quan trọng), Knowledge Distillation (teacher model dạy student model nhỏ hơn)."
211
+ "explanation": "Feature Store ensures training-serving consistency: features computed for training are exactly the same as those served online. It provides both batch (offline) and online serving modes with feature monitoring."
162
212
  },
163
213
  {
164
214
  "id": 13,
165
- "question": "Vertex AI GenAI Studio dùng để?",
215
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
216
+ "question": "When should you use Dataflow over Dataproc for data processing?",
166
217
  "options": [
167
- "Chỉ train model từ đầu",
168
- "Prototyping, testing, tuning Foundation Models (PaLM, Gemini) trên Google Cloud",
169
- "Quản billing",
170
- "Giám sát network"
218
+ "When you need to run existing Spark/Hadoop jobs",
219
+ "When you need a serverless, auto-scaling pipeline for both batch and streaming with Apache Beam",
220
+ "When you need to use PySpark",
221
+ "When you already have a Hadoop cluster"
171
222
  ],
172
223
  "correct": 1,
173
- "explanation": "GenAI Studio cung cấp UI API để thử nghiệm Foundation Models, prompt design, tuning, deploy không cần ML expertise sâu."
224
+ "explanation": "Dataflow is serverless (no cluster management), auto-scales, and uses Apache Beam for unified batch/streaming. Dataproc is for migrating existing Spark/Hadoop workloads that require a managed cluster."
174
225
  },
175
226
  {
176
227
  "id": 14,
177
- "question": "Khi data nhiều missing values, chiến lược nào phù hợp?",
228
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
229
+ "question": "A dataset has a categorical feature 'city' with 10,000 unique values. What is the BEST encoding strategy for a deep learning model?",
178
230
  "options": [
179
- "Luôn xoá rows missing values",
180
- "Tuỳ context: imputation (mean/median/mode, KNN, model-based), hoặc tạo indicator feature cho missingness",
181
- "Luôn fill bằng 0",
182
- "Bỏ qua train trực tiếp"
231
+ "One-hot encoding (creates 10,000 sparse columns)",
232
+ "Embedding layer that learns dense vector representations",
233
+ "Label encoding (assign integer 0-9999)",
234
+ "Remove the feature entirely"
183
235
  ],
184
236
  "correct": 1,
185
- "explanation": "Xử missing values tuỳ thuộc vào pattern (MCAR/MAR/MNAR): imputation thống (mean/median), model-based (KNN, MICE), hoặc thêm feature indicator. Xoá rows chỉ khi missing ít và MCAR."
237
+ "explanation": "High-cardinality categorical features should use embeddings in deep learning — the model learns a dense, low-dimensional representation. One-hot creates extremely sparse, high-dimensional input. Label encoding implies false ordinal relationships."
186
238
  },
187
239
  {
188
240
  "id": 15,
189
- "question": "Continuous Training (CT) trong MLOps gì?",
241
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
242
+ "question": "You need to compute aggregate features (e.g., average order value per customer over 30 days) for training. Which tool is MOST efficient?",
243
+ "options": [
244
+ "Vertex AI Workbench with pandas",
245
+ "BigQuery SQL window functions, then export to Feature Store",
246
+ "Cloud Functions processing individual records",
247
+ "Dataproc with MapReduce"
248
+ ],
249
+ "correct": 1,
250
+ "explanation": "BigQuery excels at large-scale aggregate computations using SQL window/analytic functions. Results can be exported to Feature Store for serving. More efficient than processing row-by-row in notebooks."
251
+ },
252
+ {
253
+ "id": 16,
254
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
255
+ "question": "What does the TFX Transform component do?",
256
+ "options": [
257
+ "Trains the model",
258
+ "Applies feature transformations consistently during training AND serving using a saved transform graph",
259
+ "Validates the input data schema",
260
+ "Deploys the model to an endpoint"
261
+ ],
262
+ "correct": 1,
263
+ "explanation": "TFX Transform uses tf.Transform to create a transform graph that is applied both during training and serving — preventing training-serving skew in feature engineering (normalization, bucketization, vocabulary mapping, etc.)."
264
+ },
265
+ {
266
+ "id": 17,
267
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
268
+ "question": "How should you handle missing values in a feature used for a gradient boosted tree model (XGBoost)?",
269
+ "options": [
270
+ "Always impute with mean",
271
+ "Always drop rows",
272
+ "XGBoost handles missing values natively — it learns the best direction for missing values at each split",
273
+ "Replace with -999 to signal missingness"
274
+ ],
275
+ "correct": 2,
276
+ "explanation": "XGBoost natively handles missing values by learning the optimal split direction for missing entries during training. Forcing imputation may actually reduce performance. This is a key advantage of tree-based models."
277
+ },
278
+ {
279
+ "id": 18,
280
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
281
+ "question": "Pub/Sub guarantees at-least-once delivery. How does this affect an ML data pipeline?",
282
+ "options": [
283
+ "It has no impact on ML pipelines",
284
+ "Dataflow must handle duplicate messages to avoid counting features incorrectly",
285
+ "It guarantees exactly-once processing automatically",
286
+ "It means messages can be lost"
287
+ ],
288
+ "correct": 1,
289
+ "explanation": "At-least-once delivery means messages can be delivered multiple times. Without deduplication in Dataflow (using unique message IDs or idempotent operations), features like counts or sums can be inflated by duplicate processing."
290
+ },
291
+ {
292
+ "id": 19,
293
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
294
+ "question": "A feature has a right-skewed distribution (e.g., income). Which transformation is MOST appropriate before using it in a linear model?",
295
+ "options": [
296
+ "Min-max scaling to [0, 1]",
297
+ "Log transformation to reduce skewness, then standardization",
298
+ "One-hot encoding",
299
+ "No transformation needed"
300
+ ],
301
+ "correct": 1,
302
+ "explanation": "Right-skewed features benefit from log transformation (or Box-Cox) to make the distribution more normal-like, improving linear model performance. Standardization after log-transform ensures zero mean and unit variance."
303
+ },
304
+ {
305
+ "id": 20,
306
+ "domain": "Domain 2: Data Engineering & Feature Engineering",
307
+ "question": "Which Cloud Composer (Airflow) role is appropriate in an ML pipeline?",
308
+ "options": [
309
+ "Training the model directly",
310
+ "Orchestrating the end-to-end workflow: data ingestion → preprocessing → training → evaluation → deployment",
311
+ "Serving real-time predictions",
312
+ "Storing features"
313
+ ],
314
+ "correct": 1,
315
+ "explanation": "Cloud Composer (managed Apache Airflow) orchestrates ML pipeline tasks: scheduling data extraction, triggering Dataflow jobs, launching Vertex AI training, evaluating results, and conditional deployment — but doesn't execute ML compute itself."
316
+ },
317
+ {
318
+ "id": 21,
319
+ "domain": "Domain 3: Model Development on Vertex AI",
320
+ "question": "You need to train a custom PyTorch model on Vertex AI with 4 GPUs. Which approach is correct?",
321
+ "options": [
322
+ "Use AutoML with a custom container",
323
+ "Use Vertex AI Custom Training Job with a pre-built PyTorch container and specify 4 GPUs in machine config",
324
+ "Use BigQuery ML with PyTorch",
325
+ "Use Cloud Functions with GPU support"
326
+ ],
327
+ "correct": 1,
328
+ "explanation": "Vertex AI Custom Training supports pre-built containers for PyTorch (and TensorFlow, XGBoost, sklearn). You specify GPU type and count in the worker pool config. Distributed training is also supported with multiple workers."
329
+ },
330
+ {
331
+ "id": 22,
332
+ "domain": "Domain 3: Model Development on Vertex AI",
333
+ "question": "When should you use Vertex AI AutoML vs Custom Training?",
334
+ "options": [
335
+ "AutoML when you need to implement a custom loss function",
336
+ "Custom Training when you just need a quick baseline with minimal ML knowledge",
337
+ "AutoML when you have tabular/image/text data and want a strong model with minimal code",
338
+ "AutoML for any production model"
339
+ ],
340
+ "correct": 2,
341
+ "explanation": "AutoML excels when: data fits standard types (tabular, image, text, video), you want a strong baseline fast, or the team has limited ML expertise. Custom Training when: custom architecture, custom loss/metrics, specific frameworks, or research-level requirements."
342
+ },
343
+ {
344
+ "id": 23,
345
+ "domain": "Domain 3: Model Development on Vertex AI",
346
+ "question": "Which BigQuery ML statement creates and trains a logistic regression model?",
347
+ "options": [
348
+ "CREATE MODEL dataset.model OPTIONS(model_type='logistic_reg') AS SELECT ...",
349
+ "TRAIN MODEL dataset.model USING logistic_regression SELECT ...",
350
+ "CREATE ML_MODEL dataset.model AS LOGISTIC_REGRESSION SELECT ...",
351
+ "BUILD MODEL dataset.model OPTIONS(type='classification') AS SELECT ..."
352
+ ],
353
+ "correct": 0,
354
+ "explanation": "BigQuery ML uses CREATE MODEL with OPTIONS to specify model type. 'CREATE MODEL my_dataset.my_model OPTIONS(model_type='logistic_reg') AS SELECT features, label FROM ...' trains the model using SQL."
355
+ },
356
+ {
357
+ "id": 24,
358
+ "domain": "Domain 3: Model Development on Vertex AI",
359
+ "question": "What is the purpose of Vertex AI Hyperparameter Tuning?",
360
+ "options": [
361
+ "To automatically select the best features",
362
+ "To systematically search for the best hyperparameters (learning rate, batch size, etc.) using Bayesian optimization or grid search",
363
+ "To clean the training data",
364
+ "To deploy multiple model versions"
365
+ ],
366
+ "correct": 1,
367
+ "explanation": "Vertex AI Hyperparameter Tuning uses Vizier (Google's black-box optimization service) to search for optimal hyperparameters. It supports Bayesian optimization, grid search, and random search strategies."
368
+ },
369
+ {
370
+ "id": 25,
371
+ "domain": "Domain 3: Model Development on Vertex AI",
372
+ "question": "A model has high training accuracy but low test accuracy. What does this indicate?",
373
+ "options": [
374
+ "Underfitting",
375
+ "Overfitting — the model memorized training data but doesn't generalize",
376
+ "The model needs more features",
377
+ "The learning rate is too low"
378
+ ],
379
+ "correct": 1,
380
+ "explanation": "High train / low test performance = overfitting. Solutions: more training data, regularization (L1/L2, dropout), simpler model, early stopping, cross-validation, or data augmentation."
381
+ },
382
+ {
383
+ "id": 26,
384
+ "domain": "Domain 3: Model Development on Vertex AI",
385
+ "question": "Which technique is MOST effective for improving model performance on a small image dataset (<1,000 images)?",
386
+ "options": [
387
+ "Training a very deep custom CNN from scratch",
388
+ "Transfer learning — fine-tuning a pre-trained model (e.g., ResNet, EfficientNet) on your dataset",
389
+ "Using simpler models like logistic regression on raw pixels",
390
+ "Increasing the learning rate"
391
+ ],
392
+ "correct": 1,
393
+ "explanation": "Transfer learning leverages knowledge from models pre-trained on large datasets (ImageNet). Fine-tuning the last few layers on your small dataset typically achieves much better results than training from scratch."
394
+ },
395
+ {
396
+ "id": 27,
397
+ "domain": "Domain 3: Model Development on Vertex AI",
398
+ "question": "When training on Vertex AI, what is the difference between pre-built containers and custom containers?",
399
+ "options": [
400
+ "Pre-built containers are faster to train",
401
+ "Pre-built containers include common frameworks (TF, PyTorch, XGBoost, sklearn); custom containers let you install any dependencies and use any framework",
402
+ "Custom containers can only be used with AutoML",
403
+ "There is no difference"
404
+ ],
405
+ "correct": 1,
406
+ "explanation": "Pre-built containers are maintained by Google with popular ML frameworks pre-installed. Custom containers allow you to define your own Docker image with any dependencies, frameworks, or custom code."
407
+ },
408
+ {
409
+ "id": 28,
410
+ "domain": "Domain 3: Model Development on Vertex AI",
411
+ "question": "Which BigQuery ML model type should you use for customer segmentation WITHOUT labels?",
412
+ "options": [
413
+ "logistic_reg",
414
+ "kmeans",
415
+ "linear_reg",
416
+ "boosted_tree_classifier"
417
+ ],
418
+ "correct": 1,
419
+ "explanation": "Customer segmentation without labels is an unsupervised learning problem. K-means clustering (model_type='kmeans') groups similar customers into clusters based on feature similarity."
420
+ },
421
+ {
422
+ "id": 29,
423
+ "domain": "Domain 3: Model Development on Vertex AI",
424
+ "question": "Which regularization technique randomly disables neurons during training to prevent co-adaptation?",
425
+ "options": [
426
+ "L1 regularization (Lasso)",
427
+ "L2 regularization (Ridge)",
428
+ "Dropout",
429
+ "Batch normalization"
430
+ ],
431
+ "correct": 2,
432
+ "explanation": "Dropout randomly sets a fraction of neurons to zero during each training step, forcing the network to learn redundant representations. This prevents neurons from co-adapting and reduces overfitting."
433
+ },
434
+ {
435
+ "id": 30,
436
+ "domain": "Domain 3: Model Development on Vertex AI",
437
+ "question": "You want to track and compare multiple training experiments on Vertex AI. Which feature should you use?",
438
+ "options": [
439
+ "Vertex AI Endpoints",
440
+ "Vertex AI Experiments with Vertex AI TensorBoard",
441
+ "Vertex AI Feature Store",
442
+ "Vertex AI Model Registry only"
443
+ ],
444
+ "correct": 1,
445
+ "explanation": "Vertex AI Experiments logs hyperparameters, metrics, and artifacts for each run. Combined with Vertex AI TensorBoard, you can visualize training curves and compare runs side by side."
446
+ },
447
+ {
448
+ "id": 31,
449
+ "domain": "Domain 4: Model Deployment & MLOps",
450
+ "question": "You deployed a model to a Vertex AI Endpoint. How do you perform an A/B test between model v1 and v2?",
451
+ "options": [
452
+ "Deploy to two separate endpoints and use a load balancer",
453
+ "Use traffic splitting on a single endpoint — route 90% to v1 and 10% to v2",
454
+ "Deploy v2 to a Cloud Function",
455
+ "Use Cloud CDN for routing"
456
+ ],
457
+ "correct": 1,
458
+ "explanation": "Vertex AI Endpoints support traffic splitting: deploy multiple model versions to the same endpoint and configure traffic percentages. This enables A/B testing and canary deployments natively."
459
+ },
460
+ {
461
+ "id": 32,
462
+ "domain": "Domain 4: Model Deployment & MLOps",
463
+ "question": "What is the difference between online prediction and batch prediction on Vertex AI?",
464
+ "options": [
465
+ "Online is cheaper than batch",
466
+ "Online returns predictions synchronously with low latency via an endpoint; batch processes large datasets asynchronously and writes results to storage",
467
+ "Batch prediction is more accurate",
468
+ "Online prediction can only handle images"
469
+ ],
470
+ "correct": 1,
471
+ "explanation": "Online prediction uses a deployed endpoint for real-time, synchronous, low-latency responses. Batch prediction accepts input files (CSV, JSONL, BigQuery), processes them asynchronously, and writes output to GCS or BigQuery."
472
+ },
473
+ {
474
+ "id": 33,
475
+ "domain": "Domain 4: Model Deployment & MLOps",
476
+ "question": "Vertex AI Model Monitoring detects 'training-serving skew' on a feature. What does this mean?",
477
+ "options": [
478
+ "The model's accuracy has decreased",
479
+ "The feature's distribution in production serving requests differs significantly from training data distribution",
480
+ "The model is serving stale predictions",
481
+ "The endpoint is experiencing high latency"
482
+ ],
483
+ "correct": 1,
484
+ "explanation": "Training-serving skew means the statistical distribution of a feature during inference is significantly different from training time. This can cause degraded model performance and may indicate data pipeline issues or real-world distribution shifts."
485
+ },
486
+ {
487
+ "id": 34,
488
+ "domain": "Domain 4: Model Deployment & MLOps",
489
+ "question": "Which TFX component evaluates a trained model against baseline and decides whether to deploy?",
490
+ "options": [
491
+ "Trainer",
492
+ "Evaluator (with TFMA — TensorFlow Model Analysis)",
493
+ "Pusher",
494
+ "ExampleValidator"
495
+ ],
496
+ "correct": 1,
497
+ "explanation": "The Evaluator component uses TFMA (TensorFlow Model Analysis) to compute metrics, compare against a baseline model, and produce a 'blessed' or 'not blessed' decision. Only blessed models proceed to Pusher for deployment."
498
+ },
499
+ {
500
+ "id": 35,
501
+ "domain": "Domain 4: Model Deployment & MLOps",
502
+ "question": "Vertex AI Pipelines is built on which open-source framework?",
503
+ "options": [
504
+ "Apache Airflow",
505
+ "Kubeflow Pipelines (KFP) SDK",
506
+ "Apache Spark",
507
+ "Luigi"
508
+ ],
509
+ "correct": 1,
510
+ "explanation": "Vertex AI Pipelines uses the Kubeflow Pipelines (KFP) v2 SDK for pipeline definition. It also supports TFX pipelines. The pipeline runs on Google's serverless infrastructure — no Kubernetes cluster management required."
511
+ },
512
+ {
513
+ "id": 36,
514
+ "domain": "Domain 4: Model Deployment & MLOps",
515
+ "question": "A model endpoint needs to handle spiky traffic (many requests during business hours, near zero at night). Which configuration is BEST?",
516
+ "options": [
517
+ "Set a fixed number of replicas to handle peak load",
518
+ "Configure autoscaling with min replicas=1 and scaling based on CPU utilization or request count",
519
+ "Use batch prediction instead",
520
+ "Deploy to Cloud Run instead of Vertex AI"
521
+ ],
522
+ "correct": 1,
523
+ "explanation": "Vertex AI Endpoints support autoscaling based on metrics like CPU utilization or traffic. Setting min replicas ensures availability while auto-scaling up for peaks and down during quiet periods optimizes cost."
524
+ },
525
+ {
526
+ "id": 37,
527
+ "domain": "Domain 4: Model Deployment & MLOps",
528
+ "question": "In an MLOps CI/CD pipeline, what should trigger model retraining?",
529
+ "options": [
530
+ "Only manual trigger by data scientists",
531
+ "Data drift detected, scheduled cadence, code changes, or model performance degradation",
532
+ "Only when new features are added",
533
+ "Every time new data arrives regardless of quantity"
534
+ ],
535
+ "correct": 1,
536
+ "explanation": "Continuous Training triggers include: data drift or concept drift exceeding thresholds, scheduled retraining (daily/weekly), code changes (new features/model architecture), or performance monitoring alerts showing degradation."
537
+ },
538
+ {
539
+ "id": 38,
540
+ "domain": "Domain 4: Model Deployment & MLOps",
541
+ "question": "What is the purpose of Vertex AI Model Registry?",
542
+ "options": [
543
+ "To store training data",
544
+ "To manage model versions, track lineage, and organize models with metadata for governance",
545
+ "To run inference",
546
+ "To create features"
547
+ ],
548
+ "correct": 1,
549
+ "explanation": "Model Registry provides a central repository for model versioning, metadata management, lineage tracking (which data/pipeline produced which model), and governance — essential for production ML."
550
+ },
551
+ {
552
+ "id": 39,
553
+ "domain": "Domain 4: Model Deployment & MLOps",
554
+ "question": "You need to deploy a model for inference at the edge (on-device). Which GCP approach is correct?",
555
+ "options": [
556
+ "Deploy to Vertex AI Endpoint and call from the edge",
557
+ "Export the model, optimize with TFLite or ONNX, and deploy it on the device",
558
+ "Use BigQuery ML for edge inference",
559
+ "Use Dataflow for edge processing"
560
+ ],
561
+ "correct": 1,
562
+ "explanation": "Edge deployment requires model optimization (TFLite for TensorFlow, ONNX for framework-agnostic) to reduce model size and latency. Vertex AI supports edge deployment with model optimization tools."
563
+ },
564
+ {
565
+ "id": 40,
566
+ "domain": "Domain 4: Model Deployment & MLOps",
567
+ "question": "Which tool in the Vertex AI ecosystem provides experiment tracking with visualization of training curves?",
568
+ "options": [
569
+ "Vertex AI Feature Store",
570
+ "Vertex AI TensorBoard",
571
+ "Vertex AI Matching Engine",
572
+ "Cloud Monitoring"
573
+ ],
574
+ "correct": 1,
575
+ "explanation": "Vertex AI TensorBoard is a managed TensorBoard instance that visualizes training metrics (loss curves, accuracy), model graphs, embeddings, and profiling data. It integrates with Vertex AI Experiments."
576
+ },
577
+ {
578
+ "id": 41,
579
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
580
+ "question": "Which Google Cloud service provides model explainability with feature attributions for Vertex AI predictions?",
581
+ "options": [
582
+ "Cloud Audit Logs",
583
+ "Vertex Explainable AI (using Integrated Gradients, XRAI, or sampled Shapley)",
584
+ "Cloud DLP",
585
+ "Security Command Center"
586
+ ],
587
+ "correct": 1,
588
+ "explanation": "Vertex Explainable AI provides feature attributions for predictions using methods like Integrated Gradients (for neural networks), sampled Shapley, and XRAI (for image models) — showing which features most influenced each prediction."
589
+ },
590
+ {
591
+ "id": 42,
592
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
593
+ "question": "A loan approval model is observed to have significantly different approval rates across racial groups. Which Responsible AI principle is being violated?",
594
+ "options": [
595
+ "Privacy",
596
+ "Fairness — the model shows demographic bias",
597
+ "Transparency",
598
+ "Accountability"
599
+ ],
600
+ "correct": 1,
601
+ "explanation": "Fairness requires that ML systems don't create or reinforce bias against protected groups. Disparate approval rates across racial groups indicate bias that must be investigated using fairness metrics and mitigated."
602
+ },
603
+ {
604
+ "id": 43,
605
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
606
+ "question": "How should you protect Vertex AI training data at rest in Google Cloud Storage?",
607
+ "options": [
608
+ "Rely on default encryption only",
609
+ "Use Customer-Managed Encryption Keys (CMEK) with Cloud KMS for additional control",
610
+ "Encrypt data manually before upload and decrypt in training code",
611
+ "Use a VPN connection"
612
+ ],
613
+ "correct": 1,
614
+ "explanation": "GCS encrypts at rest by default (Google-managed keys). CMEK gives you control over encryption keys via Cloud KMS — you can rotate, disable, or revoke keys. This is often required for compliance-sensitive ML workloads."
615
+ },
616
+ {
617
+ "id": 44,
618
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
619
+ "question": "How do you restrict which users can deploy models to Vertex AI Endpoints in a production project?",
620
+ "options": [
621
+ "Use service account keys shared across the team",
622
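+ "Use IAM roles — grant data scientists a custom role without endpoint-deployment permissions for training, and grant deployment permissions (roles/aiplatform.admin or a custom role with aiplatform.endpoints.deploy) only to authorized principals",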
623
+ "Use IP allowlisting",
624
+ "Restrict by VPC network"
625
+ ],
626
+ "correct": 1,
627
+ "explanation": "IAM roles control who can perform what actions. Separation of duties: data scientists get training permissions, while only authorized users/service accounts get deployment permissions. Custom IAM roles allow fine-grained control."
628
+ },
629
+ {
630
+ "id": 45,
631
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
632
+ "question": "What is VPC Service Controls (VPC-SC) used for in ML workloads?",
633
+ "options": [
634
+ "Speeding up network traffic",
635
+ "Creating a security perimeter around GCP services to prevent data exfiltration from Vertex AI, BigQuery, GCS",
636
+ "Managing DNS resolution",
637
+ "Load balancing inference traffic"
638
+ ],
639
+ "correct": 1,
640
+ "explanation": "VPC-SC creates a security perimeter (service perimeter) that restricts data movement in/out of specified GCP services. This prevents exfiltration of sensitive ML data from BigQuery, GCS, and Vertex AI."
641
+ },
642
+ {
643
+ "id": 46,
644
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
645
+ "question": "Differential privacy in ML training means:",
646
+ "options": [
647
+ "Encrypting the model weights",
648
+ "Adding calibrated noise during training so the model doesn't memorize or reveal individual training examples",
649
+ "Using private VPCs for training",
650
+ "Restricting access to training logs"
651
+ ],
652
+ "correct": 1,
653
+ "explanation": "Differential privacy adds calibrated noise to (clipped) gradient updates during training, which mathematically bounds how much any individual training record can influence the model — limiting what can be reconstructed or inferred about that record while maintaining model utility."
654
+ },
655
+ {
656
+ "id": 47,
657
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
658
+ "question": "Federated Learning allows:",
659
+ "options": [
660
+ "Training one large model on a single GPU",
661
+ "Training a shared model across multiple devices/organizations without centralizing raw data",
662
+ "Faster inference on edge devices",
663
+ "Real-time feature serving"
664
+ ],
665
+ "correct": 1,
666
+ "explanation": "Federated Learning trains a model across decentralized data sources (devices, hospitals, organizations). Each participant trains locally and shares model updates (not raw data), preserving data privacy."
667
+ },
668
+ {
669
+ "id": 48,
670
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
671
+ "question": "Which tool helps detect PII (Personally Identifiable Information) in text data before using it for ML training?",
672
+ "options": [
673
+ "Vertex AI Feature Store",
674
+ "Cloud Data Loss Prevention (Cloud DLP)",
675
+ "Cloud Armor",
676
+ "Cloud IDS"
677
+ ],
678
+ "correct": 1,
679
+ "explanation": "Cloud DLP inspects, classifies, and de-identifies sensitive data (PII, PHI) in text, images, and structured data. Use it to scan training data for PII before feeding it into ML pipelines."
680
+ },
681
+ {
682
+ "id": 49,
683
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
684
+ "question": "During the exam, you see a question about choosing between Dataflow, Dataproc, and Cloud Composer. Which decision framework is correct?",
685
+ "options": [
686
+ "Always choose Dataflow because it's serverless",
687
+ "Dataflow for new batch/streaming pipelines (Apache Beam); Dataproc for existing Spark/Hadoop migrations; Cloud Composer for workflow orchestration (Airflow)",
688
+ "Always choose Dataproc because it's cheaper",
689
+ "Cloud Composer for all data processing"
690
+ ],
691
+ "correct": 1,
692
+ "explanation": "Decision framework: Dataflow = serverless Apache Beam for new ETL (batch/streaming). Dataproc = managed Spark/Hadoop for existing workloads. Cloud Composer = managed Airflow for orchestrating multi-step workflows (not data processing itself)."
693
+ },
694
+ {
695
+ "id": 50,
696
+ "domain": "Domain 5: Responsible AI, Security & Exam Strategy",
697
+ "question": "A Vertex AI model is making predictions that seem incorrect but there are no alerts. What should you add to your monitoring setup?",
190
698
  "options": [
191
- "Train model chỉ một lần",
192
- "Tự động retrain model khi phát hiện trigger (data drift, schedule, hoặc performance degradation)",
193
- "Train model thủ công hàng tuần",
194
- "Chỉ dùng cho deep learning"
699
+ "More prediction logs in Cloud Logging only",
700
+ "Vertex AI Model Monitoring with skew/drift detection thresholds and alerting to Cloud Monitoring",
701
+ "Increase the number of endpoint replicas",
702
+ "Add more training data"
195
703
  ],
196
704
  "correct": 1,
197
- "explanation": "Continuous Training tự động kích hoạt retrain pipeline khi: data mới đến (scheduled), data drift vượt threshold, hoặc model performance giảm đảm bảo model luôn fresh."
705
+ "explanation": "Vertex AI Model Monitoring should be configured with: (1) training-serving skew detection, (2) prediction drift detection, (3) threshold configurations per feature, and (4) alerting via Cloud Monitoring for proactive issue detection."
198
706
  }
199
707
  ]
200
708
  }