cv-study-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1091 @@
1
+ PRACTICE = [
2
+ {
3
+ "id": 1,
4
+ "task": "Предобработка изображений и анализ цветовых пространств",
5
+ "description": "Resize, normalization, RGB/HSV/Grayscale, histograms and simple statistics.",
6
+ "code": r'''
7
+ # Task 1. Image preprocessing and color-space analysis.
8
+ # pip install opencv-python pillow matplotlib scikit-image
9
+
10
+ import cv2
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ from PIL import Image
14
+
15
+
16
+ def load_rgb(image_path=None):
17
+ if image_path is None:
18
+ from skimage import data
19
+ return data.astronaut() # RGB uint8 sample
20
+ pil_img = Image.open(image_path).convert("RGB")
21
+ return np.array(pil_img)
22
+
23
+
24
+ img_rgb = load_rgb(None) # replace None with "your_image.jpg"
25
+ img_resized = cv2.resize(img_rgb, (224, 224), interpolation=cv2.INTER_AREA)
26
+ img_float = img_resized.astype(np.float32) / 255.0
27
+ img_hsv = cv2.cvtColor(img_resized, cv2.COLOR_RGB2HSV)
28
+ img_gray = cv2.cvtColor(img_resized, cv2.COLOR_RGB2GRAY)
29
+
30
+ print("shape:", img_resized.shape)
31
+ print("dtype:", img_resized.dtype)
32
+ print("RGB mean:", img_resized.mean(axis=(0, 1)))
33
+ print("RGB std:", img_resized.std(axis=(0, 1)))
34
+ print("normalized range:", img_float.min(), img_float.max())
35
+
36
+ fig, ax = plt.subplots(2, 3, figsize=(12, 7))
37
+ ax[0, 0].imshow(img_rgb)
38
+ ax[0, 0].set_title("Original RGB")
39
+ ax[0, 1].imshow(img_resized)
40
+ ax[0, 1].set_title("Resized 224x224")
41
+ ax[0, 2].imshow(img_gray, cmap="gray")
42
+ ax[0, 2].set_title("Grayscale")
43
+
44
+ colors = ["r", "g", "b"]
45
+ for c, color in enumerate(colors):
46
+ ax[1, 0].hist(img_resized[:, :, c].ravel(), bins=256, range=(0, 255), color=color, alpha=0.5)
47
+ ax[1, 0].set_title("RGB channel histograms")
48
+
49
+ ax[1, 1].hist(img_gray.ravel(), bins=256, range=(0, 255), color="gray")
50
+ ax[1, 1].set_title("Brightness histogram")
51
+
52
+ h, s, v = cv2.split(img_hsv)
53
+ ax[1, 2].hist(h.ravel(), bins=180, range=(0, 179), color="orange", alpha=0.7, label="H")
54
+ ax[1, 2].hist(s.ravel(), bins=256, range=(0, 255), color="purple", alpha=0.4, label="S")
55
+ ax[1, 2].hist(v.ravel(), bins=256, range=(0, 255), color="black", alpha=0.3, label="V")
56
+ ax[1, 2].legend()
57
+ ax[1, 2].set_title("HSV histograms")
58
+
59
+ for a in ax.ravel():
60
+ a.axis("off") if a in ax[0] else None
61
+ plt.tight_layout()
62
+ plt.show()
63
+ ''',
64
+ },
65
+ {
66
+ "id": 2,
67
+ "task": "Подавление шума и фильтрация изображений",
68
+ "description": "Gaussian noise, salt-and-pepper noise, Gaussian/median/bilateral filtering, PSNR and SSIM.",
69
+ "code": r'''
70
+ # Task 2. Noise suppression and filtering.
71
+ # pip install opencv-python matplotlib scikit-image
72
+
73
+ import cv2
74
+ import numpy as np
75
+ import matplotlib.pyplot as plt
76
+ from skimage import data
77
+ from skimage.metrics import peak_signal_noise_ratio, structural_similarity
78
+
79
+
80
+ img = data.astronaut()
81
+ img = cv2.resize(img, (256, 256), interpolation=cv2.INTER_AREA)
82
+
83
+
84
+ def gaussian_noise(image, sigma=25):
85
+ noise = np.random.normal(0, sigma, image.shape)
86
+ return np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8)
87
+
88
+
89
+ def salt_pepper_noise(image, amount=0.04):
90
+ noisy = image.copy()
91
+ h, w = noisy.shape[:2]
92
+ n = int(amount * h * w)
93
+ ys = np.random.randint(0, h, n)
94
+ xs = np.random.randint(0, w, n)
95
+ half = n // 2
96
+ noisy[ys[:half], xs[:half]] = 0
97
+ noisy[ys[half:], xs[half:]] = 255
98
+ return noisy
99
+
100
+
101
+ def score(reference, test):
102
+ psnr = peak_signal_noise_ratio(reference, test, data_range=255)
103
+ ssim = structural_similarity(reference, test, data_range=255, channel_axis=-1)
104
+ return psnr, ssim
105
+
106
+
107
+ noisy_gauss = gaussian_noise(img, sigma=25)
108
+ noisy_sp = salt_pepper_noise(img, amount=0.05)
109
+
110
+ variants = {
111
+ "gaussian noise": noisy_gauss,
112
+ "gaussian blur": cv2.GaussianBlur(noisy_gauss, (5, 5), 0),
113
+ "median filter": cv2.medianBlur(noisy_sp, 5),
114
+ "bilateral filter": cv2.bilateralFilter(noisy_gauss, d=9, sigmaColor=75, sigmaSpace=75),
115
+ }
116
+
117
+ for name, image in variants.items():
118
+ psnr, ssim = score(img, image)
119
+ print(f"{name:16s} PSNR={psnr:6.2f}, SSIM={ssim:.4f}")
120
+
121
+ fig, ax = plt.subplots(2, 3, figsize=(12, 8))
122
+ show = [("original", img), ("gaussian noise", noisy_gauss), ("salt-pepper", noisy_sp),
123
+ ("gaussian blur", variants["gaussian blur"]), ("median", variants["median filter"]),
124
+ ("bilateral", variants["bilateral filter"])]
125
+ for a, (title, image) in zip(ax.ravel(), show):
126
+ a.imshow(image)
127
+ a.set_title(title)
128
+ a.axis("off")
129
+ plt.tight_layout()
130
+ plt.show()
131
+ ''',
132
+ },
133
+ {
134
+ "id": 3,
135
+ "task": "Обнаружение границ и выделение контуров",
136
+ "description": "Sobel, Laplacian, Canny, threshold selection and contours.",
137
+ "code": r'''
138
+ # Task 3. Edges and contours.
139
+ # pip install opencv-python matplotlib scikit-image
140
+
141
+ import cv2
142
+ import numpy as np
143
+ import matplotlib.pyplot as plt
144
+ from skimage import data
145
+
146
+
147
+ img_rgb = data.camera()
148
+ if img_rgb.ndim == 2:
149
+ gray = img_rgb
150
+ else:
151
+ gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
152
+
153
+ gray = cv2.resize(gray, (320, 320), interpolation=cv2.INTER_AREA)
154
+ blur = cv2.GaussianBlur(gray, (5, 5), 0)
155
+
156
+ sobel_x = cv2.Sobel(blur, cv2.CV_64F, 1, 0, ksize=3)
157
+ sobel_y = cv2.Sobel(blur, cv2.CV_64F, 0, 1, ksize=3)
158
+ sobel_mag = cv2.magnitude(sobel_x, sobel_y)
159
+ sobel_mag = cv2.normalize(sobel_mag, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
160
+
161
+ laplacian = cv2.Laplacian(blur, cv2.CV_64F)
162
+ laplacian = cv2.convertScaleAbs(laplacian)
163
+
164
+ threshold_pairs = [(30, 90), (50, 150), (100, 200)]
165
+ canny_maps = [cv2.Canny(blur, lo, hi) for lo, hi in threshold_pairs]
166
+
167
+ # Contours from the middle Canny result.
168
+ contours, _ = cv2.findContours(canny_maps[1], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
169
+ canvas = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
170
+ cv2.drawContours(canvas, contours, -1, (255, 0, 0), 1)
171
+ print("number of contours:", len(contours))
172
+
173
+ fig, ax = plt.subplots(2, 4, figsize=(14, 7))
174
+ items = [("gray", gray), ("sobel", sobel_mag), ("laplacian", laplacian)]
175
+ items += [(f"canny {lo}-{hi}", c) for (lo, hi), c in zip(threshold_pairs, canny_maps)]
176
+ items += [("contours", canvas)]
177
+
178
+ for a, (title, image) in zip(ax.ravel(), items):
179
+ if image.ndim == 2:
180
+ a.imshow(image, cmap="gray")
181
+ else:
182
+ a.imshow(image)
183
+ a.set_title(title)
184
+ a.axis("off")
185
+
186
+ for a in ax.ravel()[len(items):]:
187
+ a.axis("off")
188
+ plt.tight_layout()
189
+ plt.show()
190
+ ''',
191
+ },
192
+ {
193
+ "id": 4,
194
+ "task": "Классификация изображений традиционными методами МО",
195
+ "description": "HOG and LBP features, k-NN/SVM classifiers, metrics and confusion matrix.",
196
+ "code": r'''
197
+ # Task 4. Classical image classification with HOG/LBP + k-NN/SVM.
198
+ # pip install scikit-image scikit-learn matplotlib
199
+
200
+ import numpy as np
201
+ import matplotlib.pyplot as plt
202
+ from skimage.feature import hog, local_binary_pattern
203
+ from sklearn.datasets import load_digits
204
+ from sklearn.model_selection import train_test_split
205
+ from sklearn.preprocessing import StandardScaler
206
+ from sklearn.pipeline import make_pipeline
207
+ from sklearn.neighbors import KNeighborsClassifier
208
+ from sklearn.svm import SVC
209
+ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
210
+
211
+
212
+ digits = load_digits()
213
+ X_img = digits.images # 8x8 grayscale images
214
+ y = digits.target
215
+
216
+
217
+ def extract_features(image):
218
+ image = image.astype(np.float32) / 16.0
219
+ hog_feat = hog(image, orientations=8, pixels_per_cell=(4, 4), cells_per_block=(1, 1), feature_vector=True)
220
+ lbp = local_binary_pattern(image, P=8, R=1, method="uniform")
221
+ lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10), density=True)
222
+ return np.hstack([hog_feat, lbp_hist])
223
+
224
+
225
+ X = np.vstack([extract_features(im) for im in X_img])
226
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
227
+
228
+ models = {
229
+ "kNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
230
+ "SVM": make_pipeline(StandardScaler(), SVC(kernel="rbf", C=10, gamma="scale")),
231
+ }
232
+
233
+ for name, model in models.items():
234
+ model.fit(X_train, y_train)
235
+ pred = model.predict(X_test)
236
+ print("\n", name)
237
+ print("accuracy:", accuracy_score(y_test, pred))
238
+ print(classification_report(y_test, pred))
239
+ ConfusionMatrixDisplay.from_predictions(y_test, pred, cmap="Blues")
240
+ plt.title(f"{name} confusion matrix")
241
+ plt.show()
242
+ ''',
243
+ },
244
+ {
245
+ "id": 5,
246
+ "task": "Сравнение ИНС и классических алгоритмов",
247
+ "description": "Compare SVM + HOG and MLP by accuracy, train/inference time and robustness to noise.",
248
+ "code": r'''
249
+ # Task 5. SVM+HOG vs MLP: quality, time, robustness.
250
+ # pip install scikit-image scikit-learn matplotlib
251
+
252
+ import time
253
+ import numpy as np
254
+ from skimage.feature import hog
255
+ from sklearn.datasets import load_digits
256
+ from sklearn.model_selection import train_test_split
257
+ from sklearn.pipeline import make_pipeline
258
+ from sklearn.preprocessing import StandardScaler
259
+ from sklearn.svm import LinearSVC
260
+ from sklearn.neural_network import MLPClassifier
261
+ from sklearn.metrics import accuracy_score
262
+
263
+
264
+ digits = load_digits()
265
+ images = digits.images.astype(np.float32) / 16.0
266
+ y = digits.target
267
+
268
+
269
+ def hog_features(batch):
270
+ return np.vstack([
271
+ hog(im, orientations=8, pixels_per_cell=(4, 4), cells_per_block=(1, 1), feature_vector=True)
272
+ for im in batch
273
+ ])
274
+
275
+
276
+ X_hog = hog_features(images)
277
+ X_flat = images.reshape(len(images), -1)
278
+ idx_train, idx_test = train_test_split(np.arange(len(images)), test_size=0.25, stratify=y, random_state=42)
279
+
280
+ svm = make_pipeline(StandardScaler(), LinearSVC(C=1.0, max_iter=5000))
281
+ mlp = make_pipeline(
282
+ StandardScaler(),
283
+ MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", max_iter=80, random_state=42)
284
+ )
285
+
286
+ experiments = [
287
+ ("SVM + HOG", svm, X_hog[idx_train], X_hog[idx_test]),
288
+ ("MLP + pixels", mlp, X_flat[idx_train], X_flat[idx_test]),
289
+ ]
290
+
291
+ for name, model, Xtr, Xte in experiments:
292
+ t0 = time.perf_counter()
293
+ model.fit(Xtr, y[idx_train])
294
+ train_time = time.perf_counter() - t0
295
+ t0 = time.perf_counter()
296
+ pred = model.predict(Xte)
297
+ infer_time = time.perf_counter() - t0
298
+ print(f"{name:14s} acc={accuracy_score(y[idx_test], pred):.4f}, "
299
+ f"train={train_time:.3f}s, inference={infer_time:.4f}s")
300
+
301
+
302
+ def add_noise(batch, sigma=0.25):
303
+ return np.clip(batch + np.random.normal(0, sigma, batch.shape), 0, 1)
304
+
305
+
306
+ noisy = add_noise(images[idx_test], sigma=0.25)
307
+ svm_pred = svm.predict(hog_features(noisy))
308
+ mlp_pred = mlp.predict(noisy.reshape(len(noisy), -1))
309
+ print("noise robustness:")
310
+ print("SVM + HOG:", accuracy_score(y[idx_test], svm_pred))
311
+ print("MLP + pixels:", accuracy_score(y[idx_test], mlp_pred))
312
+ ''',
313
+ },
314
+ {
315
+ "id": 6,
316
+ "task": "Классификация изображений с использованием CNN",
317
+ "description": "PyTorch CNN with two convolutional layers, pooling, loss/accuracy plots and overfitting check.",
318
+ "code": r'''
319
+ # Task 6. CNN image classification in PyTorch.
320
+ # pip install torch torchvision matplotlib
321
+
322
+ import torch
323
+ from torch import nn
324
+ from torch.utils.data import DataLoader
325
+ from torchvision import datasets, transforms
326
+ import matplotlib.pyplot as plt
327
+
328
+
329
+ device = "cuda" if torch.cuda.is_available() else "cpu"
330
+ transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
331
+
332
+ train_ds = datasets.FashionMNIST(root="data", train=True, download=True, transform=transform)
333
+ test_ds = datasets.FashionMNIST(root="data", train=False, download=True, transform=transform)
334
+ train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=2)
335
+ test_loader = DataLoader(test_ds, batch_size=256, shuffle=False, num_workers=2)
336
+
337
+
338
+ class SmallCNN(nn.Module):
339
+ def __init__(self, num_classes=10):
340
+ super().__init__()
341
+ self.features = nn.Sequential(
342
+ nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
343
+ nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
344
+ )
345
+ self.classifier = nn.Sequential(
346
+ nn.Flatten(),
347
+ nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Dropout(0.3),
348
+ nn.Linear(128, num_classes),
349
+ )
350
+
351
+ def forward(self, x):
352
+ return self.classifier(self.features(x))
353
+
354
+
355
+ model = SmallCNN().to(device)
356
+ criterion = nn.CrossEntropyLoss()
357
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
358
+
359
+
360
+ def run_epoch(loader, train=True):
361
+ model.train(train)
362
+ total_loss, correct, total = 0.0, 0, 0
363
+ for x, y in loader:
364
+ x, y = x.to(device), y.to(device)
365
+ with torch.set_grad_enabled(train):
366
+ logits = model(x)
367
+ loss = criterion(logits, y)
368
+ if train:
369
+ optimizer.zero_grad()
370
+ loss.backward()
371
+ optimizer.step()
372
+ total_loss += loss.item() * x.size(0)
373
+ correct += (logits.argmax(1) == y).sum().item()
374
+ total += x.size(0)
375
+ return total_loss / total, correct / total
376
+
377
+
378
+ history = {"train_loss": [], "test_loss": [], "train_acc": [], "test_acc": []}
379
+ for epoch in range(5):
380
+ tr_loss, tr_acc = run_epoch(train_loader, train=True)
381
+ te_loss, te_acc = run_epoch(test_loader, train=False)
382
+ history["train_loss"].append(tr_loss)
383
+ history["test_loss"].append(te_loss)
384
+ history["train_acc"].append(tr_acc)
385
+ history["test_acc"].append(te_acc)
386
+ print(f"epoch {epoch+1}: train acc={tr_acc:.3f}, test acc={te_acc:.3f}")
387
+
388
+ plt.figure(figsize=(10, 4))
389
+ plt.subplot(1, 2, 1)
390
+ plt.plot(history["train_loss"], label="train")
391
+ plt.plot(history["test_loss"], label="test")
392
+ plt.title("Loss")
393
+ plt.legend()
394
+ plt.subplot(1, 2, 2)
395
+ plt.plot(history["train_acc"], label="train")
396
+ plt.plot(history["test_acc"], label="test")
397
+ plt.title("Accuracy")
398
+ plt.legend()
399
+ plt.show()
400
+ ''',
401
+ },
402
+ {
403
+ "id": 7,
404
+ "task": "Transfer Learning и дообучение моделей",
405
+ "description": "Fine-tune ResNet/VGG on a custom ImageFolder dataset and compare with training from scratch.",
406
+ "code": r'''
407
+ # Task 7. Transfer learning with torchvision.
408
+ # Dataset format:
409
+ # data/train/class_a/*.jpg, data/train/class_b/*.jpg
410
+ # data/val/class_a/*.jpg, data/val/class_b/*.jpg
411
+ # pip install torch torchvision
412
+
413
+ import copy
414
+ import time
415
+ import torch
416
+ from torch import nn
417
+ from torch.utils.data import DataLoader
418
+ from torchvision import datasets, transforms, models
419
+
420
+
421
+ device = "cuda" if torch.cuda.is_available() else "cpu"
422
+ data_dir = "data" # change to your dataset root
423
+
424
+ weights = models.ResNet50_Weights.DEFAULT
425
+ preprocess = weights.transforms()
426
+ train_tf = transforms.Compose([
427
+ transforms.Resize((256, 256)),
428
+ transforms.RandomResizedCrop(224),
429
+ transforms.RandomHorizontalFlip(),
430
+ transforms.ToTensor(),
431
+ transforms.Normalize(mean=weights.meta["mean"], std=weights.meta["std"]),
432
+ ])
433
+ val_tf = preprocess
434
+
435
+ train_ds = datasets.ImageFolder(f"{data_dir}/train", transform=train_tf)
436
+ val_ds = datasets.ImageFolder(f"{data_dir}/val", transform=val_tf)
437
+ train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2)
438
+ val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, num_workers=2)
439
+ num_classes = len(train_ds.classes)
440
+
441
+
442
+ def make_resnet50(pretrained=True, freeze_backbone=True):
443
+ model = models.resnet50(weights=weights if pretrained else None)
444
+ if freeze_backbone:
445
+ for p in model.parameters():
446
+ p.requires_grad = False
447
+ model.fc = nn.Linear(model.fc.in_features, num_classes)
448
+ return model.to(device)
449
+
450
+
451
+ def train_model(model, epochs=3, lr=1e-3):
452
+ criterion = nn.CrossEntropyLoss()
453
+ optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
454
+ best_acc, best_state = 0.0, copy.deepcopy(model.state_dict())
455
+ for epoch in range(epochs):
456
+ for phase, loader in [("train", train_loader), ("val", val_loader)]:
457
+ model.train(phase == "train")
458
+ correct, total, loss_sum = 0, 0, 0.0
459
+ for x, y in loader:
460
+ x, y = x.to(device), y.to(device)
461
+ with torch.set_grad_enabled(phase == "train"):
462
+ logits = model(x)
463
+ loss = criterion(logits, y)
464
+ if phase == "train":
465
+ optimizer.zero_grad()
466
+ loss.backward()
467
+ optimizer.step()
468
+ correct += (logits.argmax(1) == y).sum().item()
469
+ total += y.numel()
470
+ loss_sum += loss.item() * y.numel()
471
+ acc = correct / total
472
+ print(f"epoch {epoch+1} {phase}: loss={loss_sum/total:.4f}, acc={acc:.4f}")
473
+ if phase == "val" and acc > best_acc:
474
+ best_acc, best_state = acc, copy.deepcopy(model.state_dict())
475
+ model.load_state_dict(best_state)
476
+ return best_acc
477
+
478
+
479
+ start = time.perf_counter()
480
+ transfer_model = make_resnet50(pretrained=True, freeze_backbone=True)
481
+ transfer_acc = train_model(transfer_model, epochs=3, lr=1e-3)
482
+ print("transfer learning best acc:", transfer_acc, "time:", time.perf_counter() - start)
483
+
484
+ # Baseline from scratch. Usually needs more data/epochs.
485
+ scratch_model = make_resnet50(pretrained=False, freeze_backbone=False)
486
+ scratch_acc = train_model(scratch_model, epochs=3, lr=1e-4)
487
+ print("scratch best acc:", scratch_acc)
488
+ ''',
489
+ },
490
+ {
491
+ "id": 8,
492
+ "task": "Детекция объектов на изображениях",
493
+ "description": "YOLO inference on images/video and simple IoU/mAP placeholders.",
494
+ "code": r'''
495
+ # Task 8. Object detection with YOLO.
496
+ # pip install ultralytics opencv-python matplotlib
497
+
498
+ from ultralytics import YOLO
499
+ import cv2
500
+ import numpy as np
501
+ import matplotlib.pyplot as plt
502
+
503
+
504
+ model = YOLO("yolo11n.pt") # or "yolov8n.pt" if that is what your environment has
505
+
506
+ # Image inference.
507
+ image_path = "image.jpg" # replace with your file
508
+ results = model.predict(source=image_path, conf=0.25, save=True)
509
+ result = results[0]
510
+ print(result.boxes)
511
+
512
+ annotated = result.plot() # BGR array
513
+ plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
514
+ plt.axis("off")
515
+ plt.show()
516
+
517
+ # Video inference.
518
+ # model.predict(source="video.mp4", conf=0.25, save=True, stream=False)
519
+
520
+
521
+ def iou_xyxy(box_a, box_b):
522
+ ax1, ay1, ax2, ay2 = box_a
523
+ bx1, by1, bx2, by2 = box_b
524
+ ix1, iy1 = max(ax1, bx1), max(ay1, by1)
525
+ ix2, iy2 = min(ax2, bx2), min(ay2, by2)
526
+ inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
527
+ area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
528
+ area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
529
+ union = area_a + area_b - inter
530
+ return inter / union if union else 0.0
531
+
532
+
533
+ # Example: evaluate one predicted box against one ground-truth box.
534
+ pred_box = np.array([50, 40, 180, 200])
535
+ true_box = np.array([60, 50, 170, 210])
536
+ print("IoU example:", iou_xyxy(pred_box, true_box))
537
+
538
+ # For real mAP use dataset YAML and validation:
539
+ # metrics = model.val(data="coco8.yaml")
540
+ # print(metrics.box.map, metrics.box.map50)
541
+ ''',
542
+ },
543
+ {
544
+ "id": 9,
545
+ "task": "Семантическая сегментация изображений",
546
+ "description": "Threshold segmentation, watershed, small U-Net and Dice/IoU metrics.",
547
+ "code": r'''
548
+ # Task 9. Semantic segmentation: threshold, watershed, U-Net metrics.
549
+ # pip install opencv-python scikit-image matplotlib tensorflow
550
+
551
+ import cv2
552
+ import numpy as np
553
+ import matplotlib.pyplot as plt
554
+ from skimage import data
555
+ from skimage.segmentation import watershed
556
+ from skimage.feature import peak_local_max
557
+ from scipy import ndimage as ndi
558
+
559
+
560
+ gray = data.coins()
561
+ gray = cv2.resize(gray, (256, 256), interpolation=cv2.INTER_AREA)
562
+ blur = cv2.GaussianBlur(gray, (5, 5), 0)
563
+ _, otsu = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
564
+
565
+ distance = ndi.distance_transform_edt(otsu)
566
+ coords = peak_local_max(distance, footprint=np.ones((15, 15)), labels=otsu)
567
+ markers = np.zeros(distance.shape, dtype=np.int32)
568
+ markers[tuple(coords.T)] = np.arange(1, len(coords) + 1)
569
+ labels = watershed(-distance, markers, mask=otsu.astype(bool))
570
+
571
+
572
+ def dice_coef(y_true, y_pred, eps=1e-7):
573
+ y_true = y_true.astype(bool)
574
+ y_pred = y_pred.astype(bool)
575
+ return (2 * np.logical_and(y_true, y_pred).sum() + eps) / (y_true.sum() + y_pred.sum() + eps)
576
+
577
+
578
+ def iou_mask(y_true, y_pred, eps=1e-7):
579
+ y_true = y_true.astype(bool)
580
+ y_pred = y_pred.astype(bool)
581
+ inter = np.logical_and(y_true, y_pred).sum()
582
+ union = np.logical_or(y_true, y_pred).sum()
583
+ return (inter + eps) / (union + eps)
584
+
585
+
586
+ print("Example Dice self-check:", dice_coef(otsu > 0, otsu > 0))
587
+ print("Example IoU self-check:", iou_mask(otsu > 0, otsu > 0))
588
+
589
+ fig, ax = plt.subplots(1, 4, figsize=(14, 4))
590
+ for a, (title, image) in zip(ax, [("gray", gray), ("otsu", otsu), ("distance", distance), ("watershed", labels)]):
591
+ a.imshow(image, cmap="gray" if title != "watershed" else "nipy_spectral")
592
+ a.set_title(title)
593
+ a.axis("off")
594
+ plt.show()
595
+
596
+ # Minimal U-Net skeleton for real image/mask tensors X_train, Y_train.
597
+ import tensorflow as tf
598
+ from tensorflow.keras import layers, Model
599
+
600
+
601
+ def conv_block(x, filters):
602
+ x = layers.Conv2D(filters, 3, padding="same", activation="relu")(x)
603
+ x = layers.Conv2D(filters, 3, padding="same", activation="relu")(x)
604
+ return x
605
+
606
+
607
+ def build_unet(input_shape=(128, 128, 3), num_classes=1):
608
+ inputs = layers.Input(input_shape)
609
+ c1 = conv_block(inputs, 32); p1 = layers.MaxPool2D()(c1)
610
+ c2 = conv_block(p1, 64); p2 = layers.MaxPool2D()(c2)
611
+ b = conv_block(p2, 128)
612
+ u2 = layers.UpSampling2D()(b); u2 = layers.Concatenate()([u2, c2]); c3 = conv_block(u2, 64)
613
+ u1 = layers.UpSampling2D()(c3); u1 = layers.Concatenate()([u1, c1]); c4 = conv_block(u1, 32)
614
+ outputs = layers.Conv2D(num_classes, 1, activation="sigmoid")(c4)
615
+ return Model(inputs, outputs)
616
+
617
+
618
+ unet = build_unet()
619
+ unet.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
620
+ unet.summary()
621
+ # unet.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=8)
622
+ ''',
623
+ },
624
+ {
625
+ "id": 10,
626
+ "task": "Сегментация медицинских изображений",
627
+ "description": "U-Net/Mask R-CNN direction, Dice/IoU, connected components and error analysis.",
628
+ "code": r'''
629
+ # Task 10. Medical image segmentation template.
630
+ # pip install tensorflow opencv-python scikit-image matplotlib
631
+
632
+ import cv2
633
+ import numpy as np
634
+ import matplotlib.pyplot as plt
635
+ from skimage.measure import label, regionprops
636
+ import tensorflow as tf
637
+ from tensorflow.keras import layers, Model
638
+
639
+
640
+ def dice_coef_np(y_true, y_pred, eps=1e-7):
641
+ y_true = y_true.astype(bool)
642
+ y_pred = y_pred.astype(bool)
643
+ return (2 * np.logical_and(y_true, y_pred).sum() + eps) / (y_true.sum() + y_pred.sum() + eps)
644
+
645
+
646
+ def iou_np(y_true, y_pred, eps=1e-7):
647
+ y_true = y_true.astype(bool)
648
+ y_pred = y_pred.astype(bool)
649
+ inter = np.logical_and(y_true, y_pred).sum()
650
+ union = np.logical_or(y_true, y_pred).sum()
651
+ return (inter + eps) / (union + eps)
652
+
653
+
654
+ def build_unet(input_shape=(256, 256, 1)):
655
+ def block(x, f):
656
+ x = layers.Conv2D(f, 3, padding="same", activation="relu")(x)
657
+ x = layers.BatchNormalization()(x)
658
+ x = layers.Conv2D(f, 3, padding="same", activation="relu")(x)
659
+ return x
660
+
661
+ inp = layers.Input(input_shape)
662
+ c1 = block(inp, 32); p1 = layers.MaxPooling2D()(c1)
663
+ c2 = block(p1, 64); p2 = layers.MaxPooling2D()(c2)
664
+ c3 = block(p2, 128); p3 = layers.MaxPooling2D()(c3)
665
+ b = block(p3, 256)
666
+ u3 = layers.UpSampling2D()(b); u3 = layers.Concatenate()([u3, c3]); d3 = block(u3, 128)
667
+ u2 = layers.UpSampling2D()(d3); u2 = layers.Concatenate()([u2, c2]); d2 = block(u2, 64)
668
+ u1 = layers.UpSampling2D()(d2); u1 = layers.Concatenate()([u1, c1]); d1 = block(u1, 32)
669
+ out = layers.Conv2D(1, 1, activation="sigmoid")(d1)
670
+ return Model(inp, out)
671
+
672
+
673
+ model = build_unet()
674
+ model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss="binary_crossentropy")
675
+
676
+ # Real data placeholders:
677
+ # X_train: float32 images in [0, 1], shape (N, 256, 256, 1)
678
+ # Y_train: binary masks, shape (N, 256, 256, 1)
679
+ # model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=20, batch_size=4)
680
+
681
+
682
+ def analyze_errors(image, true_mask, prob_mask, threshold=0.5):
683
+ pred = prob_mask > threshold
684
+ true = true_mask > 0
685
+ fp = np.logical_and(pred, ~true)
686
+ fn = np.logical_and(~pred, true)
687
+ print("Dice:", dice_coef_np(true, pred))
688
+ print("IoU:", iou_np(true, pred))
689
+ print("false positive pixels:", fp.sum())
690
+ print("false negative pixels:", fn.sum())
691
+
692
+ labeled = label(pred)
693
+ areas = [r.area for r in regionprops(labeled)]
694
+ print("predicted connected components:", len(areas), "areas:", areas[:10])
695
+
696
+ fig, ax = plt.subplots(1, 4, figsize=(14, 4))
697
+ for a, (title, arr, cmap) in zip(ax, [
698
+ ("image", image.squeeze(), "gray"),
699
+ ("true", true, "gray"),
700
+ ("pred", pred, "gray"),
701
+ ("errors: FP red, FN blue", np.dstack([fp, np.zeros_like(fp), fn]).astype(float), None),
702
+ ]):
703
+ a.imshow(arr, cmap=cmap)
704
+ a.set_title(title)
705
+ a.axis("off")
706
+ plt.show()
707
+
708
+
709
+ # Example call after prediction:
710
+ # probs = model.predict(X_val[:1])[0, ..., 0]
711
+ # analyze_errors(X_val[0], Y_val[0, ..., 0], probs)
712
+
713
+ # If instance segmentation is required, use Mask R-CNN from torchvision/detectron2/ultralytics
714
+ # when annotations are object instances rather than one semantic mask.
715
+ ''',
716
+ },
717
+ {
718
+ "id": 11,
719
+ "task": "Классификация изображений с использованием Vision Transformer",
720
+ "description": "Fine-tune torchvision ViT and compare it with a small CNN.",
721
+ "code": r'''
722
+ # Task 11. Vision Transformer classification and CNN comparison.
723
+ # pip install torch torchvision
724
+
725
+ import torch
726
+ from torch import nn
727
+ from torch.utils.data import DataLoader, Subset
728
+ from torchvision import datasets, transforms, models
729
+
730
+
731
+ device = "cuda" if torch.cuda.is_available() else "cpu"
732
+
733
+ weights = models.ViT_B_16_Weights.DEFAULT
734
+ vit_tf = weights.transforms()
735
+ train_ds = datasets.CIFAR10(root="data", train=True, download=True, transform=vit_tf)
736
+ test_ds = datasets.CIFAR10(root="data", train=False, download=True, transform=vit_tf)
737
+
738
+ # Keep a small subset for an exam demo; remove Subset for full training.
739
+ train_ds = Subset(train_ds, range(2000))
740
+ test_ds = Subset(test_ds, range(500))
741
+ train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2)
742
+ test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, num_workers=2)
743
+
744
+
745
+ def train_one(model, epochs=1, lr=1e-4):
746
+ model.to(device)
747
+ criterion = nn.CrossEntropyLoss()
748
+ optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
749
+ for epoch in range(epochs):
750
+ model.train()
751
+ for x, y in train_loader:
752
+ x, y = x.to(device), y.to(device)
753
+ logits = model(x)
754
+ loss = criterion(logits, y)
755
+ optimizer.zero_grad()
756
+ loss.backward()
757
+ optimizer.step()
758
+ model.eval()
759
+ correct, total = 0, 0
760
+ with torch.no_grad():
761
+ for x, y in test_loader:
762
+ x, y = x.to(device), y.to(device)
763
+ pred = model(x).argmax(1)
764
+ correct += (pred == y).sum().item()
765
+ total += y.numel()
766
+ return correct / total
767
+
768
+
769
+ vit = models.vit_b_16(weights=weights)
770
+ for p in vit.parameters():
771
+ p.requires_grad = False
772
+ vit.heads.head = nn.Linear(vit.heads.head.in_features, 10)
773
+ vit_acc = train_one(vit, epochs=1, lr=1e-3)
774
+ print("ViT transfer acc:", vit_acc)
775
+
776
+
777
+ class SmallCNN(nn.Module):
778
+ def __init__(self):
779
+ super().__init__()
780
+ self.net = nn.Sequential(
781
+ nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
782
+ nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((1, 1)),
783
+ nn.Flatten(), nn.Linear(64, 10)
784
+ )
785
+
786
+ def forward(self, x):
787
+ return self.net(x)
788
+
789
+
790
+ cnn_acc = train_one(SmallCNN(), epochs=1, lr=1e-3)
791
+ print("Small CNN acc:", cnn_acc)
792
+
793
+ print("Comparison notes: ViT has global self-attention and benefits from pretraining; "
794
+ "CNN has locality/translation bias and is often cheaper on small data.")
795
+ ''',
796
+ },
797
+ {
798
+ "id": 12,
799
+ "task": "Генерация текстовых описаний изображений",
800
+ "description": "CNN encoder + Transformer decoder skeleton and BLEU evaluation.",
801
+ "code": r'''
802
+ # Task 12. Image captioning: CNN encoder + Transformer decoder skeleton.
803
+ # pip install torch torchvision nltk pillow
804
+
805
+ import math
806
+ import torch
807
+ from torch import nn
808
+ from torchvision import models
809
+ from nltk.translate.bleu_score import corpus_bleu
810
+
811
+
812
+ device = "cuda" if torch.cuda.is_available() else "cpu"
813
+
814
+
815
+ class CNNEncoder(nn.Module):
816
+ def __init__(self, embed_dim=256):
817
+ super().__init__()
818
+ weights = models.ResNet18_Weights.DEFAULT
819
+ resnet = models.resnet18(weights=weights)
820
+ self.backbone = nn.Sequential(*list(resnet.children())[:-2])
821
+ self.proj = nn.Conv2d(512, embed_dim, kernel_size=1)
822
+
823
+ def forward(self, images):
824
+ feats = self.backbone(images) # B, 512, H/32, W/32
825
+ feats = self.proj(feats) # B, D, h, w
826
+ tokens = feats.flatten(2).permute(2, 0, 1) # S, B, D
827
+ return tokens
828
+
829
+
830
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first inputs.

    The table is stored as a non-trainable buffer of shape
    ``(max_len, 1, d_model)`` so it broadcasts over the batch axis of an
    ``(S, B, d_model)`` input.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Number of positions precomputed; inputs longer than this
            cannot be encoded.
    """

    def __init__(self, d_model, max_len=256):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = pos * div  # max_len, ceil(d_model / 2)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd (cosine) column than even
        # (sine) columns, so trim the angles to the number of cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(1))

    def forward(self, x):
        """Add the encoding of the first ``x.size(0)`` positions to ``x``."""
        return x + self.pe[: x.size(0)]
842
+
843
+
844
class CaptionModel(nn.Module):
    """Captioning model: CNN image tokens decoded by a causally masked Transformer decoder."""

    def __init__(self, vocab_size, embed_dim=256, nhead=8, layers=3, pad_id=0):
        super().__init__()
        self.encoder = CNNEncoder(embed_dim)
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.pos = PositionalEncoding(embed_dim)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(embed_dim, nhead, dim_feedforward=512),
            num_layers=layers,
        )
        self.out = nn.Linear(embed_dim, vocab_size)
        self.pad_id = pad_id

    def forward(self, images, captions_in):
        """Return vocabulary logits for every position of ``captions_in``.

        ``captions_in`` is presumably a batch-first (B, T) tensor of token
        ids; the sequence axis is moved to the front for the decoder and
        moved back before returning.
        """
        image_tokens = self.encoder(images)
        word_vectors = self.embed(captions_in).transpose(1, 0)
        tgt = self.pos(word_vectors)
        # Causal mask: position t may only attend to positions <= t.
        steps = tgt.size(0)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(steps).to(tgt.device)
        hidden = self.decoder(tgt, image_tokens, tgt_mask=causal_mask)
        logits = self.out(hidden)
        return logits.transpose(1, 0)
862
+
863
+
864
+ # Training idea:
865
+ # captions_in = [BOS, w1, w2, ...]
866
+ # captions_out = [w1, w2, ..., EOS]
867
+ # logits = model(images, captions_in)
868
+ # loss = CrossEntropyLoss(ignore_index=pad_id)(logits.reshape(-1, vocab), captions_out.reshape(-1))
869
+
870
+
871
def evaluate_bleu(references, hypotheses, smoothing_function=None):
    """Corpus-level BLEU for tokenised captions.

    Args:
        references: list[list[list[str]]] - for every sample, one or more
            reference token lists, e.g. [[["a", "cat", "sits"]]].
        hypotheses: list[list[str]] - one predicted token list per sample,
            e.g. [["a", "cat", "sits"]].
        smoothing_function: Optional NLTK smoothing callable. When omitted,
            ``SmoothingFunction().method1`` is used.

    Returns:
        The corpus BLEU score as a float.
    """
    # Captions are short, so higher-order n-grams often have zero matches.
    # Unsmoothed BLEU then collapses to ~0 (NLTK only emits a warning), even
    # for near-perfect hypotheses; smoothing keeps the score informative.
    from nltk.translate.bleu_score import SmoothingFunction

    if smoothing_function is None:
        smoothing_function = SmoothingFunction().method1
    return corpus_bleu(references, hypotheses, smoothing_function=smoothing_function)
875
+
876
+
877
# Tiny sanity check: two samples, one reference each.
refs = [
    [["a", "cat", "on", "a", "sofa"]],
    [["a", "dog", "runs"]],
]
hyps = [
    ["a", "cat", "on", "sofa"],
    ["a", "dog", "runs"],
]
bleu_score = evaluate_bleu(refs, hyps)
print("BLEU example:", bleu_score)
880
+ ''',
881
+ },
882
+ {
883
+ "id": 13,
884
+ "task": "Ручная реализация 2D-свёртки и фильтрации",
885
+ "description": "NumPy convolution with stride, padding, several kernels, comparison with cv2.filter2D and timing.",
886
+ "code": r'''
887
+ # Task 13. Manual 2D convolution and filters.
888
+ # pip install opencv-python matplotlib scikit-image
889
+
890
+ import time
891
+ import cv2
892
+ import numpy as np
893
+ import matplotlib.pyplot as plt
894
+ from skimage import data
895
+
896
+
897
def conv2d_numpy(image, kernel, stride=1, padding=0):
    """True 2-D convolution of a single-channel image, computed with explicit loops.

    The kernel is flipped in both axes before the sliding dot product, and the
    image is zero-padded by ``padding`` pixels on every side.

    Args:
        image: 2-D array-like; converted to float32.
        kernel: 2-D array-like; converted to float32.
        stride: Step between neighbouring windows, must be >= 1.
        padding: Number of zero pixels added on each border, must be >= 0.

    Returns:
        float32 array of shape ``((H + 2p - kh) // stride + 1,
        (W + 2p - kw) // stride + 1)``.

    Raises:
        ValueError: for a non-positive stride, negative padding, non-2-D
            inputs, or a kernel larger than the padded image.
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")
    if padding < 0:
        raise ValueError(f"padding must be >= 0, got {padding}")

    image = np.asarray(image, dtype=np.float32)
    kernel = np.asarray(kernel, dtype=np.float32)
    if image.ndim != 2 or kernel.ndim != 2:
        raise ValueError("image and kernel must both be 2-D")

    if padding:
        image = np.pad(image, ((padding, padding), (padding, padding)), mode="constant")

    kh, kw = kernel.shape
    oh = (image.shape[0] - kh) // stride + 1
    ow = (image.shape[1] - kw) // stride + 1
    if oh < 1 or ow < 1:
        raise ValueError("kernel is larger than the (padded) image")

    out = np.zeros((oh, ow), dtype=np.float32)
    # Flipping the kernel turns the sliding dot product below into a
    # convolution rather than a cross-correlation.
    flipped = np.flipud(np.fliplr(kernel))
    for y in range(oh):
        top = y * stride
        for x in range(ow):
            left = x * stride
            patch = image[top:top + kh, left:left + kw]
            out[y, x] = np.sum(patch * flipped)
    return out
912
+
913
+
914
# Demo: run every kernel through the manual convolution, time it, and
# cross-check the result against OpenCV.
gray = data.camera().astype(np.float32)
gray = cv2.resize(gray, (256, 256), interpolation=cv2.INTER_AREA)

kernels = {
    "blur": np.ones((3, 3), dtype=np.float32) / 9,
    "sharpen": np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32),
    "sobel_x": np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=np.float32),
    "sobel_y": np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=np.float32),
}

fig, ax = plt.subplots(1, len(kernels) + 1, figsize=(14, 4))
ax[0].imshow(gray, cmap="gray")
ax[0].set_title("input")
ax[0].axis("off")

for i, (name, kernel) in enumerate(kernels.items(), start=1):
    t0 = time.perf_counter()
    manual = conv2d_numpy(gray, kernel, stride=1, padding=1)
    t_manual = time.perf_counter() - t0

    # cv2.filter2D computes a correlation and reflects the image at the
    # border by default. conv2d_numpy flips the kernel and zero-pads, so give
    # OpenCV the flipped kernel and a constant (zero) border; otherwise the
    # asymmetric Sobel kernels come out with the opposite sign.
    cv_kernel = cv2.flip(kernel, -1)
    t0 = time.perf_counter()
    cv = cv2.filter2D(gray, ddepth=-1, kernel=cv_kernel, borderType=cv2.BORDER_CONSTANT)
    t_cv = time.perf_counter() - t0

    diff = np.mean(np.abs(manual - cv))
    print(f"{name:8s} manual={t_manual:.4f}s, cv2={t_cv:.6f}s, mean_abs_diff={diff:.4f}")
    ax[i].imshow(manual, cmap="gray")
    ax[i].set_title(name)
    ax[i].axis("off")

plt.tight_layout()
plt.show()
946
+ ''',
947
+ },
948
+ {
949
+ "id": 14,
950
+ "task": "Ручная реализация pooling и простого CNN-блока",
951
+ "description": "Manual max/average pooling, Conv -> ReLU -> Pool block and comparison with PyTorch.",
952
+ "code": r'''
953
+ # Task 14. Manual pooling and simple CNN block.
954
+ # pip install torch opencv-python scikit-image
955
+
956
+ import numpy as np
957
+ import torch
958
+ import torch.nn.functional as F
959
+ from skimage import data
960
+ import cv2
961
+
962
+
963
def pool2d_numpy(x, kernel_size=2, stride=2, mode="max"):
    """Max or average pooling over a 2-D array, computed with explicit loops.

    Args:
        x: 2-D array-like.
        kernel_size: Side length of the square pooling window, must be >= 1.
        stride: Step between neighbouring windows, must be >= 1.
        mode: ``"max"`` for max pooling; ``"avg"`` (also ``"average"`` or
            ``"mean"``) for average pooling.

    Returns:
        Array of shape ``((H - k) // stride + 1, (W - k) // stride + 1)``.
        Max pooling keeps the input dtype. Average pooling keeps floating
        dtypes and promotes everything else to float64.

    Raises:
        ValueError: for an unknown mode, non-positive kernel_size/stride, or
            a window larger than the input.
    """
    if mode == "max":
        reduce_window = np.max
    elif mode in ("avg", "average", "mean"):
        reduce_window = np.mean
    else:
        # A typo such as "Max" used to fall through to average pooling.
        raise ValueError(f"unknown pooling mode: {mode!r}")
    if kernel_size < 1 or stride < 1:
        raise ValueError("kernel_size and stride must be >= 1")

    x = np.asarray(x)
    h, w = x.shape
    oh = (h - kernel_size) // stride + 1
    ow = (w - kernel_size) // stride + 1
    if oh < 1 or ow < 1:
        raise ValueError("pooling window is larger than the input")

    # Averages of integers are generally fractional; writing them into an
    # integer output array would silently truncate them.
    if mode == "max" or np.issubdtype(x.dtype, np.floating):
        out_dtype = x.dtype
    else:
        out_dtype = np.float64

    out = np.zeros((oh, ow), dtype=out_dtype)
    for y in range(oh):
        top = y * stride
        for x0 in range(ow):
            left = x0 * stride
            patch = x[top:top + kernel_size, left:left + kernel_size]
            out[y, x0] = reduce_window(patch)
    return out
973
+
974
+
975
def conv2d_valid(x, kernel):
    """Slide ``kernel`` over ``x`` without padding (stride 1) and sum the products.

    The kernel is applied as-is (not flipped). The result is a float32 array
    of shape ``(H - kh + 1, W - kw + 1)``.
    """
    kh, kw = kernel.shape
    out_h = x.shape[0] - kh + 1
    out_w = x.shape[1] - kw + 1
    result = np.zeros((out_h, out_w), dtype=np.float32)
    for row, col in np.ndindex(out_h, out_w):
        window = x[row:row + kh, col:col + kw]
        result[row, col] = np.sum(window * kernel)
    return result
983
+
984
+
985
# Demo pipeline on a 64x64 sample image: vertical-edge filter -> ReLU -> 2x2 pooling.
img = cv2.resize(
    data.camera().astype(np.float32) / 255.0,
    (64, 64),
    interpolation=cv2.INTER_AREA,
)
kernel = np.array(
    [
        [1, 0, -1],
        [1, 0, -1],
        [1, 0, -1],
    ],
    dtype=np.float32,
)

conv = conv2d_valid(img, kernel)
relu = np.maximum(conv, 0)
max_pool, avg_pool = (
    pool2d_numpy(relu, kernel_size=2, stride=2, mode=pool_mode)
    for pool_mode in ("max", "avg")
)

stages = (
    ("input", img),
    ("after conv", conv),
    ("after relu", relu),
    ("after max pool", max_pool),
    ("after avg pool", avg_pool),
)
for label, array in stages:
    print(f"{label}:", array.shape)

# Cross-check both pooling variants against PyTorch on the same activations.
t = torch.tensor(relu).unsqueeze(0).unsqueeze(0)  # 1, 1, H, W
torch_max = F.max_pool2d(t, kernel_size=2, stride=2).squeeze().numpy()
torch_avg = F.avg_pool2d(t, kernel_size=2, stride=2).squeeze().numpy()
print("max pool diff:", np.abs(torch_max - max_pool).max())
print("avg pool diff:", np.abs(torch_avg - avg_pool).max())
1006
+ ''',
1007
+ },
1008
+ {
1009
+ "id": 15,
1010
+ "task": "Ручная реализация self-attention и позиционного кодирования",
1011
+ "description": "Patchify, scaled dot-product attention, sinusoidal positional encoding and tensor shapes.",
1012
+ "code": r'''
1013
+ # Task 15. Patchify, positional encoding and self-attention.
1014
+ # pip install torch
1015
+
1016
+ import math
1017
+ import torch
1018
+
1019
+
1020
def patchify(images, patch_size=16):
    """Cut a (B, C, H, W) batch into non-overlapping square patches.

    Returns a (B, N, C * patch_size * patch_size) tensor with
    N = (H / patch_size) * (W / patch_size). Patches are ordered row by row
    and each patch is flattened channel-first.
    """
    b, c, h, w = images.shape
    assert h % patch_size == 0 and w % patch_size == 0
    rows, cols = h // patch_size, w // patch_size
    # B, C, nH, p, nW, p -> B, nH, nW, C, p, p
    grid = images.reshape(b, c, rows, patch_size, cols, patch_size)
    grid = grid.permute(0, 2, 4, 1, 3, 5)
    return grid.reshape(b, -1, c * patch_size * patch_size)
1028
+
1029
+
1030
def sinusoidal_positional_encoding(seq_len, dim, device="cpu"):
    """Build a (seq_len, dim) table of sinusoidal positional encodings.

    Even columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)``, with ``w_i = 10000 ** (-2i / dim)``.

    Args:
        seq_len: Number of positions (rows).
        dim: Encoding width. Both even and odd values are supported.
        device: Device on which the table is created.

    Returns:
        Float tensor of shape (seq_len, dim).
    """
    pe = torch.zeros(seq_len, dim, device=device)
    pos = torch.arange(seq_len, device=device).unsqueeze(1)
    div = torch.exp(torch.arange(0, dim, 2, device=device) * (-math.log(10000.0) / dim))
    angles = pos * div  # seq_len, ceil(dim / 2)
    pe[:, 0::2] = torch.sin(angles)
    # An odd dim has one fewer cosine column than sine columns; trim the
    # angles so the assignment shapes match.
    pe[:, 1::2] = torch.cos(angles[:, : dim // 2])
    return pe
1037
+
1038
+
1039
def scaled_dot_product_attention(q, k, v, mask=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: Tensors of shape (B, heads, N, d).
        mask: Optional tensor broadcastable to the (B, heads, N, N) score
            matrix. Entries equal to 0 (or False) mark keys that must not be
            attended to.

    Returns:
        ``(out, attn)``: the attended values, shaped like ``q``, and the
        attention weights, shaped (B, heads, N, N). A query whose keys are
        all masked gets all-zero weights and an all-zero output row.
    """
    d = q.size(-1)
    scores = q @ k.transpose(-2, -1) / math.sqrt(d)
    if mask is None:
        attn = torch.softmax(scores, dim=-1)
    else:
        blocked = mask == 0
        scores = scores.masked_fill(blocked, float("-inf"))
        attn = torch.softmax(scores, dim=-1)
        # Softmax over a row of only -inf is NaN; give such fully masked
        # queries zero weights so the NaN does not reach the output.
        attn = attn.masked_fill(blocked.all(dim=-1, keepdim=True), 0.0)
    out = attn @ v
    return out, attn
1048
+
1049
+
1050
def split_heads(x, num_heads):
    """Reshape (B, N, D) into per-head form (B, heads, N, D / heads)."""
    batch, seq_len, width = x.shape
    assert width % num_heads == 0
    head_dim = width // num_heads
    per_head = x.view(batch, seq_len, num_heads, head_dim)
    # Swap the sequence and head axes: B, N, heads, hd -> B, heads, N, hd.
    return per_head.permute(0, 2, 1, 3)
1055
+
1056
+
1057
# Demo: random images -> patch tokens -> multi-head self-attention, printing every shape.
B, C, H, W = 2, 3, 32, 32
patch_size, embed_dim, num_heads = 8, 64, 4

images = torch.randn(B, C, H, W)
patches = patchify(images, patch_size=patch_size)
proj = torch.nn.Linear(patches.size(-1), embed_dim)
tokens = proj(patches)
tokens = tokens + sinusoidal_positional_encoding(tokens.size(1), embed_dim, tokens.device)

# Separate learned projections for queries, keys and values (created in that order).
q_proj, k_proj, v_proj = (torch.nn.Linear(embed_dim, embed_dim) for _ in range(3))
q, k, v = (split_heads(layer(tokens), num_heads) for layer in (q_proj, k_proj, v_proj))
out, attn = scaled_dot_product_attention(q, k, v)

for label, tensor in (("images", images), ("patches", patches), ("tokens", tokens)):
    print(f"{label}:", tensor.shape)
print("Q/K/V:", q.shape, k.shape, v.shape)
print("attention matrix:", attn.shape)
print("attention output:", out.shape)

# Run PyTorch's own MultiheadAttention on the same tokens to compare shapes.
mha = torch.nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
lib_out, lib_attn = mha(tokens, tokens, tokens)
print("library output:", lib_out.shape)
print("library attention:", lib_attn.shape)
1089
+ ''',
1090
+ },
1091
+ ]