cv-study-utils 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cv_study_utils/__init__.py +25 -0
- cv_study_utils/api.py +271 -0
- cv_study_utils/cli.py +53 -0
- cv_study_utils/practice_solutions.py +1091 -0
- cv_study_utils/sources.py +36 -0
- cv_study_utils/theory_answers.py +290 -0
- cv_study_utils/theory_extras.py +287 -0
- cv_study_utils-0.1.0.dist-info/METADATA +74 -0
- cv_study_utils-0.1.0.dist-info/RECORD +12 -0
- cv_study_utils-0.1.0.dist-info/WHEEL +5 -0
- cv_study_utils-0.1.0.dist-info/entry_points.txt +2 -0
- cv_study_utils-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1091 @@
|
|
|
1
|
+
PRACTICE = [
|
|
2
|
+
{
|
|
3
|
+
"id": 1,
|
|
4
|
+
"task": "Предобработка изображений и анализ цветовых пространств",
|
|
5
|
+
"description": "Resize, normalization, RGB/HSV/Grayscale, histograms and simple statistics.",
|
|
6
|
+
"code": r'''
|
|
7
|
+
# Task 1. Image preprocessing and color-space analysis.
|
|
8
|
+
# pip install opencv-python pillow matplotlib scikit-image
|
|
9
|
+
|
|
10
|
+
import cv2
|
|
11
|
+
import numpy as np
|
|
12
|
+
import matplotlib.pyplot as plt
|
|
13
|
+
from PIL import Image
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_rgb(image_path=None):
    """Load an image as an RGB NumPy array.

    Args:
        image_path: Path of the image file to read. When ``None`` the
            scikit-image ``astronaut`` sample is returned instead.

    Returns:
        The image as a NumPy array built from an RGB-converted PIL image,
        or the scikit-image sample when no path is given.
    """
    if image_path is None:
        # Imported lazily so scikit-image is only needed for the sample image.
        from skimage import data
        return data.astronaut()  # RGB uint8 sample
    # The context manager releases the file handle deterministically;
    # convert() produces an independent, fully loaded image before the
    # file is closed.
    with Image.open(image_path) as pil_img:
        return np.array(pil_img.convert("RGB"))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
img_rgb = load_rgb(None) # replace None with "your_image.jpg"
|
|
25
|
+
img_resized = cv2.resize(img_rgb, (224, 224), interpolation=cv2.INTER_AREA)
|
|
26
|
+
img_float = img_resized.astype(np.float32) / 255.0
|
|
27
|
+
img_hsv = cv2.cvtColor(img_resized, cv2.COLOR_RGB2HSV)
|
|
28
|
+
img_gray = cv2.cvtColor(img_resized, cv2.COLOR_RGB2GRAY)
|
|
29
|
+
|
|
30
|
+
print("shape:", img_resized.shape)
|
|
31
|
+
print("dtype:", img_resized.dtype)
|
|
32
|
+
print("RGB mean:", img_resized.mean(axis=(0, 1)))
|
|
33
|
+
print("RGB std:", img_resized.std(axis=(0, 1)))
|
|
34
|
+
print("normalized range:", img_float.min(), img_float.max())
|
|
35
|
+
|
|
36
|
+
fig, ax = plt.subplots(2, 3, figsize=(12, 7))
|
|
37
|
+
ax[0, 0].imshow(img_rgb)
|
|
38
|
+
ax[0, 0].set_title("Original RGB")
|
|
39
|
+
ax[0, 1].imshow(img_resized)
|
|
40
|
+
ax[0, 1].set_title("Resized 224x224")
|
|
41
|
+
ax[0, 2].imshow(img_gray, cmap="gray")
|
|
42
|
+
ax[0, 2].set_title("Grayscale")
|
|
43
|
+
|
|
44
|
+
colors = ["r", "g", "b"]
|
|
45
|
+
for c, color in enumerate(colors):
|
|
46
|
+
ax[1, 0].hist(img_resized[:, :, c].ravel(), bins=256, range=(0, 255), color=color, alpha=0.5)
|
|
47
|
+
ax[1, 0].set_title("RGB channel histograms")
|
|
48
|
+
|
|
49
|
+
ax[1, 1].hist(img_gray.ravel(), bins=256, range=(0, 255), color="gray")
|
|
50
|
+
ax[1, 1].set_title("Brightness histogram")
|
|
51
|
+
|
|
52
|
+
h, s, v = cv2.split(img_hsv)
|
|
53
|
+
ax[1, 2].hist(h.ravel(), bins=180, range=(0, 179), color="orange", alpha=0.7, label="H")
|
|
54
|
+
ax[1, 2].hist(s.ravel(), bins=256, range=(0, 255), color="purple", alpha=0.4, label="S")
|
|
55
|
+
ax[1, 2].hist(v.ravel(), bins=256, range=(0, 255), color="black", alpha=0.3, label="V")
|
|
56
|
+
ax[1, 2].legend()
|
|
57
|
+
ax[1, 2].set_title("HSV histograms")
|
|
58
|
+
|
|
59
|
+
# Hide ticks and frames on the image panels (top row) only; the histogram
# panels in the bottom row keep their axes so the scales stay readable.
for a in ax[0]:
    a.axis("off")
|
|
61
|
+
plt.tight_layout()
|
|
62
|
+
plt.show()
|
|
63
|
+
''',
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"id": 2,
|
|
67
|
+
"task": "Подавление шума и фильтрация изображений",
|
|
68
|
+
"description": "Gaussian noise, salt-and-pepper noise, Gaussian/median/bilateral filtering, PSNR and SSIM.",
|
|
69
|
+
"code": r'''
|
|
70
|
+
# Task 2. Noise suppression and filtering.
|
|
71
|
+
# pip install opencv-python matplotlib scikit-image
|
|
72
|
+
|
|
73
|
+
import cv2
|
|
74
|
+
import numpy as np
|
|
75
|
+
import matplotlib.pyplot as plt
|
|
76
|
+
from skimage import data
|
|
77
|
+
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
img = data.astronaut()
|
|
81
|
+
img = cv2.resize(img, (256, 256), interpolation=cv2.INTER_AREA)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def gaussian_noise(image, sigma=25, rng=None):
    """Add zero-mean Gaussian noise to an image and return it as uint8.

    Args:
        image: Input array; values are treated as lying on the 0..255 scale.
        sigma: Standard deviation of the noise on that same scale.
        rng: Optional object with a ``normal(loc, scale, size)`` method, e.g.
            ``np.random.default_rng(seed)``. Defaults to the global
            ``np.random`` state, as before.

    Returns:
        A new uint8 array with the same shape as ``image``.
    """
    sampler = np.random if rng is None else rng
    noise = sampler.normal(0, sigma, image.shape)
    noisy = np.clip(image.astype(np.float32) + noise, 0, 255)
    # Adding 0.5 before the truncating cast rounds to nearest; a bare
    # astype(uint8) would floor every value and darken the image by ~0.5.
    return (noisy + 0.5).astype(np.uint8)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def salt_pepper_noise(image, amount=0.04):
    """Return a copy of ``image`` corrupted with salt-and-pepper noise.

    ``int(amount * height * width)`` pixel positions are drawn with
    replacement from the global NumPy RNG. The first half of them is set
    to 0 (pepper) and the rest to 255 (salt). The input is not modified.
    """
    rows, cols = image.shape[:2]
    count = int(amount * rows * cols)
    # Row indices are drawn before column indices; keep this order so a
    # seeded run reproduces the same pattern.
    row_idx = np.random.randint(0, rows, count)
    col_idx = np.random.randint(0, cols, count)
    pepper_count = count // 2

    result = image.copy()
    result[row_idx[:pepper_count], col_idx[:pepper_count]] = 0
    result[row_idx[pepper_count:], col_idx[pepper_count:]] = 255
    return result
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def score(reference, test):
    """Compute PSNR and SSIM of ``test`` against ``reference``.

    Both images are assumed to be on the 0..255 scale (``data_range=255``).

    Args:
        reference: Ground-truth image, 2-D grayscale or 3-D with channels last.
        test: Image to evaluate; same shape as ``reference``.

    Returns:
        Tuple ``(psnr, ssim)``.
    """
    # Treat the last axis as channels only when one exists. For a 2-D
    # grayscale image, channel_axis=-1 would misread the columns as channels.
    channel_axis = -1 if reference.ndim == 3 else None
    psnr = peak_signal_noise_ratio(reference, test, data_range=255)
    ssim = structural_similarity(
        reference, test, data_range=255, channel_axis=channel_axis
    )
    return psnr, ssim
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
noisy_gauss = gaussian_noise(img, sigma=25)
|
|
108
|
+
noisy_sp = salt_pepper_noise(img, amount=0.05)
|
|
109
|
+
|
|
110
|
+
variants = {
|
|
111
|
+
"gaussian noise": noisy_gauss,
|
|
112
|
+
"gaussian blur": cv2.GaussianBlur(noisy_gauss, (5, 5), 0),
|
|
113
|
+
"median filter": cv2.medianBlur(noisy_sp, 5),
|
|
114
|
+
"bilateral filter": cv2.bilateralFilter(noisy_gauss, d=9, sigmaColor=75, sigmaSpace=75),
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
for name, image in variants.items():
|
|
118
|
+
psnr, ssim = score(img, image)
|
|
119
|
+
print(f"{name:16s} PSNR={psnr:6.2f}, SSIM={ssim:.4f}")
|
|
120
|
+
|
|
121
|
+
fig, ax = plt.subplots(2, 3, figsize=(12, 8))
|
|
122
|
+
show = [("original", img), ("gaussian noise", noisy_gauss), ("salt-pepper", noisy_sp),
|
|
123
|
+
("gaussian blur", variants["gaussian blur"]), ("median", variants["median filter"]),
|
|
124
|
+
("bilateral", variants["bilateral filter"])]
|
|
125
|
+
for a, (title, image) in zip(ax.ravel(), show):
|
|
126
|
+
a.imshow(image)
|
|
127
|
+
a.set_title(title)
|
|
128
|
+
a.axis("off")
|
|
129
|
+
plt.tight_layout()
|
|
130
|
+
plt.show()
|
|
131
|
+
''',
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
"id": 3,
|
|
135
|
+
"task": "Обнаружение границ и выделение контуров",
|
|
136
|
+
"description": "Sobel, Laplacian, Canny, threshold selection and contours.",
|
|
137
|
+
"code": r'''
|
|
138
|
+
# Task 3. Edges and contours.
|
|
139
|
+
# pip install opencv-python matplotlib scikit-image
|
|
140
|
+
|
|
141
|
+
import cv2
|
|
142
|
+
import numpy as np
|
|
143
|
+
import matplotlib.pyplot as plt
|
|
144
|
+
from skimage import data
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
img_rgb = data.camera()
|
|
148
|
+
if img_rgb.ndim == 2:
|
|
149
|
+
gray = img_rgb
|
|
150
|
+
else:
|
|
151
|
+
gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
|
|
152
|
+
|
|
153
|
+
gray = cv2.resize(gray, (320, 320), interpolation=cv2.INTER_AREA)
|
|
154
|
+
blur = cv2.GaussianBlur(gray, (5, 5), 0)
|
|
155
|
+
|
|
156
|
+
sobel_x = cv2.Sobel(blur, cv2.CV_64F, 1, 0, ksize=3)
|
|
157
|
+
sobel_y = cv2.Sobel(blur, cv2.CV_64F, 0, 1, ksize=3)
|
|
158
|
+
sobel_mag = cv2.magnitude(sobel_x, sobel_y)
|
|
159
|
+
sobel_mag = cv2.normalize(sobel_mag, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
|
|
160
|
+
|
|
161
|
+
laplacian = cv2.Laplacian(blur, cv2.CV_64F)
|
|
162
|
+
laplacian = cv2.convertScaleAbs(laplacian)
|
|
163
|
+
|
|
164
|
+
threshold_pairs = [(30, 90), (50, 150), (100, 200)]
|
|
165
|
+
canny_maps = [cv2.Canny(blur, lo, hi) for lo, hi in threshold_pairs]
|
|
166
|
+
|
|
167
|
+
# Contours from the middle Canny result.
|
|
168
|
+
contours, _ = cv2.findContours(canny_maps[1], cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
|
169
|
+
canvas = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
|
|
170
|
+
cv2.drawContours(canvas, contours, -1, (255, 0, 0), 1)
|
|
171
|
+
print("number of contours:", len(contours))
|
|
172
|
+
|
|
173
|
+
fig, ax = plt.subplots(2, 4, figsize=(14, 7))
|
|
174
|
+
items = [("gray", gray), ("sobel", sobel_mag), ("laplacian", laplacian)]
|
|
175
|
+
items += [(f"canny {lo}-{hi}", c) for (lo, hi), c in zip(threshold_pairs, canny_maps)]
|
|
176
|
+
items += [("contours", canvas)]
|
|
177
|
+
|
|
178
|
+
for a, (title, image) in zip(ax.ravel(), items):
|
|
179
|
+
if image.ndim == 2:
|
|
180
|
+
a.imshow(image, cmap="gray")
|
|
181
|
+
else:
|
|
182
|
+
a.imshow(image)
|
|
183
|
+
a.set_title(title)
|
|
184
|
+
a.axis("off")
|
|
185
|
+
|
|
186
|
+
for a in ax.ravel()[len(items):]:
|
|
187
|
+
a.axis("off")
|
|
188
|
+
plt.tight_layout()
|
|
189
|
+
plt.show()
|
|
190
|
+
''',
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"id": 4,
|
|
194
|
+
"task": "Классификация изображений традиционными методами МО",
|
|
195
|
+
"description": "HOG and LBP features, k-NN/SVM classifiers, metrics and confusion matrix.",
|
|
196
|
+
"code": r'''
|
|
197
|
+
# Task 4. Classical image classification with HOG/LBP + k-NN/SVM.
|
|
198
|
+
# pip install scikit-image scikit-learn matplotlib
|
|
199
|
+
|
|
200
|
+
import numpy as np
|
|
201
|
+
import matplotlib.pyplot as plt
|
|
202
|
+
from skimage.feature import hog, local_binary_pattern
|
|
203
|
+
from sklearn.datasets import load_digits
|
|
204
|
+
from sklearn.model_selection import train_test_split
|
|
205
|
+
from sklearn.preprocessing import StandardScaler
|
|
206
|
+
from sklearn.pipeline import make_pipeline
|
|
207
|
+
from sklearn.neighbors import KNeighborsClassifier
|
|
208
|
+
from sklearn.svm import SVC
|
|
209
|
+
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
digits = load_digits()
|
|
213
|
+
X_img = digits.images # 8x8 grayscale images
|
|
214
|
+
y = digits.target
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def extract_features(image):
    """Build a HOG + uniform-LBP feature vector for one grayscale image.

    Args:
        image: 2-D array of pixel intensities. The HOG input is divided by
            16, matching the 0..16 range of the ``load_digits`` images.
            NOTE(review): the LBP step casts to int32 and therefore assumes
            integer-valued intensities, as in ``load_digits`` - confirm
            before reusing with other data.

    Returns:
        1-D array: HOG descriptor followed by a 10-bin LBP histogram.
    """
    scaled = image.astype(np.float32) / 16.0
    hog_feat = hog(
        scaled,
        orientations=8,
        pixels_per_cell=(4, 4),
        cells_per_block=(1, 1),
        feature_vector=True,
    )
    # LBP only compares neighbouring pixel values, so it is computed on an
    # integer image; scikit-image warns when it is given floating-point data.
    lbp = local_binary_pattern(image.astype(np.int32), P=8, R=1, method="uniform")
    # Uniform LBP with P=8 yields codes 0..9, hence ten unit-width bins.
    # The bin edges are explicit, so no separate `range` is needed.
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)
    return np.hstack([hog_feat, lbp_hist])
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
X = np.vstack([extract_features(im) for im in X_img])
|
|
226
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
|
|
227
|
+
|
|
228
|
+
models = {
|
|
229
|
+
"kNN": make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
|
|
230
|
+
"SVM": make_pipeline(StandardScaler(), SVC(kernel="rbf", C=10, gamma="scale")),
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
for name, model in models.items():
|
|
234
|
+
model.fit(X_train, y_train)
|
|
235
|
+
pred = model.predict(X_test)
|
|
236
|
+
print("\n", name)
|
|
237
|
+
print("accuracy:", accuracy_score(y_test, pred))
|
|
238
|
+
print(classification_report(y_test, pred))
|
|
239
|
+
ConfusionMatrixDisplay.from_predictions(y_test, pred, cmap="Blues")
|
|
240
|
+
plt.title(f"{name} confusion matrix")
|
|
241
|
+
plt.show()
|
|
242
|
+
''',
|
|
243
|
+
},
|
|
244
|
+
{
|
|
245
|
+
"id": 5,
|
|
246
|
+
"task": "Сравнение ИНС и классических алгоритмов",
|
|
247
|
+
"description": "Compare SVM + HOG and MLP by accuracy, train/inference time and robustness to noise.",
|
|
248
|
+
"code": r'''
|
|
249
|
+
# Task 5. SVM+HOG vs MLP: quality, time, robustness.
|
|
250
|
+
# pip install scikit-image scikit-learn matplotlib
|
|
251
|
+
|
|
252
|
+
import time
|
|
253
|
+
import numpy as np
|
|
254
|
+
from skimage.feature import hog
|
|
255
|
+
from sklearn.datasets import load_digits
|
|
256
|
+
from sklearn.model_selection import train_test_split
|
|
257
|
+
from sklearn.pipeline import make_pipeline
|
|
258
|
+
from sklearn.preprocessing import StandardScaler
|
|
259
|
+
from sklearn.svm import LinearSVC
|
|
260
|
+
from sklearn.neural_network import MLPClassifier
|
|
261
|
+
from sklearn.metrics import accuracy_score
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
digits = load_digits()
|
|
265
|
+
images = digits.images.astype(np.float32) / 16.0
|
|
266
|
+
y = digits.target
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def hog_features(batch):
    """Compute a HOG descriptor per image and stack them row-wise.

    Each image in ``batch`` is described with 8 orientations, 4x4-pixel
    cells and 1x1-cell blocks. The result has one row per image.
    """
    descriptors = []
    for im in batch:
        descriptor = hog(
            im,
            orientations=8,
            pixels_per_cell=(4, 4),
            cells_per_block=(1, 1),
            feature_vector=True,
        )
        descriptors.append(descriptor)
    return np.vstack(descriptors)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
X_hog = hog_features(images)
|
|
277
|
+
X_flat = images.reshape(len(images), -1)
|
|
278
|
+
idx_train, idx_test = train_test_split(np.arange(len(images)), test_size=0.25, stratify=y, random_state=42)
|
|
279
|
+
|
|
280
|
+
svm = make_pipeline(StandardScaler(), LinearSVC(C=1.0, max_iter=5000))
|
|
281
|
+
mlp = make_pipeline(
|
|
282
|
+
StandardScaler(),
|
|
283
|
+
MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", max_iter=80, random_state=42)
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
experiments = [
|
|
287
|
+
("SVM + HOG", svm, X_hog[idx_train], X_hog[idx_test]),
|
|
288
|
+
("MLP + pixels", mlp, X_flat[idx_train], X_flat[idx_test]),
|
|
289
|
+
]
|
|
290
|
+
|
|
291
|
+
for name, model, Xtr, Xte in experiments:
|
|
292
|
+
t0 = time.perf_counter()
|
|
293
|
+
model.fit(Xtr, y[idx_train])
|
|
294
|
+
train_time = time.perf_counter() - t0
|
|
295
|
+
t0 = time.perf_counter()
|
|
296
|
+
pred = model.predict(Xte)
|
|
297
|
+
infer_time = time.perf_counter() - t0
|
|
298
|
+
print(f"{name:14s} acc={accuracy_score(y[idx_test], pred):.4f}, "
|
|
299
|
+
f"train={train_time:.3f}s, inference={infer_time:.4f}s")
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def add_noise(batch, sigma=0.25, rng=None):
    """Add zero-mean Gaussian noise to ``batch`` and clip it to [0, 1].

    Args:
        batch: Array of values on the 0..1 scale.
        sigma: Standard deviation of the noise.
        rng: Optional object with a ``normal(loc, scale, size)`` method, e.g.
            ``np.random.default_rng(seed)``, so the robustness check can be
            reproduced. Defaults to the global ``np.random`` state, as before.

    Returns:
        A new array with the same shape as ``batch``, clipped to [0, 1].
    """
    sampler = np.random if rng is None else rng
    return np.clip(batch + sampler.normal(0, sigma, batch.shape), 0, 1)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
noisy = add_noise(images[idx_test], sigma=0.25)
|
|
307
|
+
svm_pred = svm.predict(hog_features(noisy))
|
|
308
|
+
mlp_pred = mlp.predict(noisy.reshape(len(noisy), -1))
|
|
309
|
+
print("noise robustness:")
|
|
310
|
+
print("SVM + HOG:", accuracy_score(y[idx_test], svm_pred))
|
|
311
|
+
print("MLP + pixels:", accuracy_score(y[idx_test], mlp_pred))
|
|
312
|
+
''',
|
|
313
|
+
},
|
|
314
|
+
{
|
|
315
|
+
"id": 6,
|
|
316
|
+
"task": "Классификация изображений с использованием CNN",
|
|
317
|
+
"description": "PyTorch CNN with two convolutional layers, pooling, loss/accuracy plots and overfitting check.",
|
|
318
|
+
"code": r'''
|
|
319
|
+
# Task 6. CNN image classification in PyTorch.
|
|
320
|
+
# pip install torch torchvision matplotlib
|
|
321
|
+
|
|
322
|
+
import torch
|
|
323
|
+
from torch import nn
|
|
324
|
+
from torch.utils.data import DataLoader
|
|
325
|
+
from torchvision import datasets, transforms
|
|
326
|
+
import matplotlib.pyplot as plt
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
330
|
+
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
|
|
331
|
+
|
|
332
|
+
train_ds = datasets.FashionMNIST(root="data", train=True, download=True, transform=transform)
|
|
333
|
+
test_ds = datasets.FashionMNIST(root="data", train=False, download=True, transform=transform)
|
|
334
|
+
# num_workers=0 loads batches in the main process. This script has no
# `if __name__ == "__main__":` guard, and on platforms that start workers
# with "spawn" (Windows, macOS) worker processes re-import the script and
# fail. Raise num_workers only after moving the code below under a guard.
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=0)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False, num_workers=0)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class SmallCNN(nn.Module):
    """Small CNN: two conv/ReLU/max-pool stages followed by an MLP head.

    The first linear layer takes ``64 * 7 * 7`` features, so the network
    expects single-channel 28x28 inputs (each pooling stage halves the size).
    """

    def __init__(self, num_classes=10):
        """Create the layers; ``num_classes`` sets the size of the output."""
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        head = [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for a batch of images."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
model = SmallCNN().to(device)
|
|
356
|
+
criterion = nn.CrossEntropyLoss()
|
|
357
|
+
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. When ``train`` is true the model is put in training mode and
    the optimizer steps after every batch. Otherwise gradients are disabled
    and the model is only evaluated.
    """
    model.train(train)
    loss_sum = 0.0
    hits = 0
    seen = 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_size = inputs.size(0)
        with torch.set_grad_enabled(train):
            outputs = model(inputs)
            batch_loss = criterion(outputs, targets)
            if train:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
        # The criterion returns a per-batch mean; weight it by batch size so
        # the epoch average is exact even when the last batch is smaller.
        loss_sum += batch_loss.item() * batch_size
        hits += (outputs.argmax(1) == targets).sum().item()
        seen += batch_size
    return loss_sum / seen, hits / seen
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
history = {"train_loss": [], "test_loss": [], "train_acc": [], "test_acc": []}
|
|
379
|
+
for epoch in range(5):
|
|
380
|
+
tr_loss, tr_acc = run_epoch(train_loader, train=True)
|
|
381
|
+
te_loss, te_acc = run_epoch(test_loader, train=False)
|
|
382
|
+
history["train_loss"].append(tr_loss)
|
|
383
|
+
history["test_loss"].append(te_loss)
|
|
384
|
+
history["train_acc"].append(tr_acc)
|
|
385
|
+
history["test_acc"].append(te_acc)
|
|
386
|
+
print(f"epoch {epoch+1}: train acc={tr_acc:.3f}, test acc={te_acc:.3f}")
|
|
387
|
+
|
|
388
|
+
plt.figure(figsize=(10, 4))
|
|
389
|
+
plt.subplot(1, 2, 1)
|
|
390
|
+
plt.plot(history["train_loss"], label="train")
|
|
391
|
+
plt.plot(history["test_loss"], label="test")
|
|
392
|
+
plt.title("Loss")
|
|
393
|
+
plt.legend()
|
|
394
|
+
plt.subplot(1, 2, 2)
|
|
395
|
+
plt.plot(history["train_acc"], label="train")
|
|
396
|
+
plt.plot(history["test_acc"], label="test")
|
|
397
|
+
plt.title("Accuracy")
|
|
398
|
+
plt.legend()
|
|
399
|
+
plt.show()
|
|
400
|
+
''',
|
|
401
|
+
},
|
|
402
|
+
{
|
|
403
|
+
"id": 7,
|
|
404
|
+
"task": "Transfer Learning и дообучение моделей",
|
|
405
|
+
"description": "Fine-tune ResNet/VGG on a custom ImageFolder dataset and compare with training from scratch.",
|
|
406
|
+
"code": r'''
|
|
407
|
+
# Task 7. Transfer learning with torchvision.
|
|
408
|
+
# Dataset format:
|
|
409
|
+
# data/train/class_a/*.jpg, data/train/class_b/*.jpg
|
|
410
|
+
# data/val/class_a/*.jpg, data/val/class_b/*.jpg
|
|
411
|
+
# pip install torch torchvision
|
|
412
|
+
|
|
413
|
+
import copy
|
|
414
|
+
import time
|
|
415
|
+
import torch
|
|
416
|
+
from torch import nn
|
|
417
|
+
from torch.utils.data import DataLoader
|
|
418
|
+
from torchvision import datasets, transforms, models
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
422
|
+
data_dir = "data" # change to your dataset root
|
|
423
|
+
|
|
424
|
+
weights = models.ResNet50_Weights.DEFAULT
|
|
425
|
+
preprocess = weights.transforms()
|
|
426
|
+
train_tf = transforms.Compose([
|
|
427
|
+
transforms.Resize((256, 256)),
|
|
428
|
+
transforms.RandomResizedCrop(224),
|
|
429
|
+
transforms.RandomHorizontalFlip(),
|
|
430
|
+
transforms.ToTensor(),
|
|
431
|
+
transforms.Normalize(mean=weights.meta["mean"], std=weights.meta["std"]),
|
|
432
|
+
])
|
|
433
|
+
val_tf = preprocess
|
|
434
|
+
|
|
435
|
+
train_ds = datasets.ImageFolder(f"{data_dir}/train", transform=train_tf)
|
|
436
|
+
val_ds = datasets.ImageFolder(f"{data_dir}/val", transform=val_tf)
|
|
437
|
+
# num_workers=0 loads batches in the main process. This script has no
# `if __name__ == "__main__":` guard, and on platforms that start workers
# with "spawn" (Windows, macOS) worker processes re-import the script and
# fail. Raise num_workers only after moving the code below under a guard.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, num_workers=0)
|
|
439
|
+
num_classes = len(train_ds.classes)
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def make_resnet50(pretrained=True, freeze_backbone=True):
    """Create a ResNet-50 whose final layer outputs ``num_classes`` logits.

    Args:
        pretrained: Load the module-level ``weights`` when true; otherwise
            start from random initialisation.
        freeze_backbone: When true, disable gradients for every existing
            parameter. The replacement ``fc`` layer is created afterwards,
            so it stays trainable.

    Returns:
        The model moved to the module-level ``device``.
    """
    init_weights = weights if pretrained else None
    net = models.resnet50(weights=init_weights)
    if freeze_backbone:
        for param in net.parameters():
            param.requires_grad_(False)
    in_features = net.fc.in_features
    net.fc = nn.Linear(in_features, num_classes)
    return net.to(device)
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def train_model(model, epochs=3, lr=1e-3):
    """Train ``model`` and return its best validation accuracy.

    Each epoch runs a training pass over the module-level ``train_loader``
    and then an evaluation pass over ``val_loader``. Only parameters with
    ``requires_grad`` set are given to the AdamW optimizer. The weights with
    the highest validation accuracy are restored before returning.
    """
    loss_fn = nn.CrossEntropyLoss()
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=lr)

    top_acc = 0.0
    top_weights = copy.deepcopy(model.state_dict())
    phases = (("train", train_loader), ("val", val_loader))

    for epoch in range(epochs):
        for phase, loader in phases:
            is_train = phase == "train"
            model.train(is_train)
            hits = 0
            seen = 0
            running_loss = 0.0
            for x, y in loader:
                x = x.to(device)
                y = y.to(device)
                with torch.set_grad_enabled(is_train):
                    logits = model(x)
                    loss = loss_fn(logits, y)
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                batch = y.numel()
                hits += (logits.argmax(1) == y).sum().item()
                seen += batch
                running_loss += loss.item() * batch
            acc = hits / seen
            print(f"epoch {epoch+1} {phase}: loss={running_loss/seen:.4f}, acc={acc:.4f}")
            # Snapshot the weights only when validation accuracy improves.
            if not is_train and acc > top_acc:
                top_acc = acc
                top_weights = copy.deepcopy(model.state_dict())

    model.load_state_dict(top_weights)
    return top_acc
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
start = time.perf_counter()
|
|
480
|
+
transfer_model = make_resnet50(pretrained=True, freeze_backbone=True)
|
|
481
|
+
transfer_acc = train_model(transfer_model, epochs=3, lr=1e-3)
|
|
482
|
+
print("transfer learning best acc:", transfer_acc, "time:", time.perf_counter() - start)
|
|
483
|
+
|
|
484
|
+
# Baseline from scratch. Usually needs more data/epochs.
|
|
485
|
+
scratch_model = make_resnet50(pretrained=False, freeze_backbone=False)
|
|
486
|
+
scratch_acc = train_model(scratch_model, epochs=3, lr=1e-4)
|
|
487
|
+
print("scratch best acc:", scratch_acc)
|
|
488
|
+
''',
|
|
489
|
+
},
|
|
490
|
+
{
|
|
491
|
+
"id": 8,
|
|
492
|
+
"task": "Детекция объектов на изображениях",
|
|
493
|
+
"description": "YOLO inference on images/video and simple IoU/mAP placeholders.",
|
|
494
|
+
"code": r'''
|
|
495
|
+
# Task 8. Object detection with YOLO.
|
|
496
|
+
# pip install ultralytics opencv-python matplotlib
|
|
497
|
+
|
|
498
|
+
from ultralytics import YOLO
|
|
499
|
+
import cv2
|
|
500
|
+
import numpy as np
|
|
501
|
+
import matplotlib.pyplot as plt
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
model = YOLO("yolo11n.pt") # or "yolov8n.pt" if that is what your environment has
|
|
505
|
+
|
|
506
|
+
# Image inference.
|
|
507
|
+
image_path = "image.jpg" # replace with your file
|
|
508
|
+
results = model.predict(source=image_path, conf=0.25, save=True)
|
|
509
|
+
result = results[0]
|
|
510
|
+
print(result.boxes)
|
|
511
|
+
|
|
512
|
+
annotated = result.plot() # BGR array
|
|
513
|
+
plt.imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
|
|
514
|
+
plt.axis("off")
|
|
515
|
+
plt.show()
|
|
516
|
+
|
|
517
|
+
# Video inference.
|
|
518
|
+
# model.predict(source="video.mp4", conf=0.25, save=True, stream=False)
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def iou_xyxy(box_a, box_b):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Negative widths and heights are clamped to zero. The result is 0.0 when
    the union area is zero.
    """
    a_left, a_top, a_right, a_bottom = box_a
    b_left, b_top, b_right, b_bottom = box_b

    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    overlap = overlap_w * overlap_h

    size_a = max(0, a_right - a_left) * max(0, a_bottom - a_top)
    size_b = max(0, b_right - b_left) * max(0, b_bottom - b_top)
    combined = size_a + size_b - overlap

    if not combined:
        return 0.0
    return overlap / combined
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
# Example: evaluate one predicted box against one ground-truth box.
|
|
534
|
+
pred_box = np.array([50, 40, 180, 200])
|
|
535
|
+
true_box = np.array([60, 50, 170, 210])
|
|
536
|
+
print("IoU example:", iou_xyxy(pred_box, true_box))
|
|
537
|
+
|
|
538
|
+
# For real mAP use dataset YAML and validation:
|
|
539
|
+
# metrics = model.val(data="coco8.yaml")
|
|
540
|
+
# print(metrics.box.map, metrics.box.map50)
|
|
541
|
+
''',
|
|
542
|
+
},
|
|
543
|
+
{
|
|
544
|
+
"id": 9,
|
|
545
|
+
"task": "Семантическая сегментация изображений",
|
|
546
|
+
"description": "Threshold segmentation, watershed, small U-Net and Dice/IoU metrics.",
|
|
547
|
+
"code": r'''
|
|
548
|
+
# Task 9. Semantic segmentation: threshold, watershed, U-Net metrics.
|
|
549
|
+
# pip install opencv-python scikit-image scipy matplotlib tensorflow
|
|
550
|
+
|
|
551
|
+
import cv2
|
|
552
|
+
import numpy as np
|
|
553
|
+
import matplotlib.pyplot as plt
|
|
554
|
+
from skimage import data
|
|
555
|
+
from skimage.segmentation import watershed
|
|
556
|
+
from skimage.feature import peak_local_max
|
|
557
|
+
from scipy import ndimage as ndi
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
gray = data.coins()
|
|
561
|
+
gray = cv2.resize(gray, (256, 256), interpolation=cv2.INTER_AREA)
|
|
562
|
+
blur = cv2.GaussianBlur(gray, (5, 5), 0)
|
|
563
|
+
_, otsu = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
|
564
|
+
|
|
565
|
+
distance = ndi.distance_transform_edt(otsu)
|
|
566
|
+
coords = peak_local_max(distance, footprint=np.ones((15, 15)), labels=otsu)
|
|
567
|
+
markers = np.zeros(distance.shape, dtype=np.int32)
|
|
568
|
+
markers[tuple(coords.T)] = np.arange(1, len(coords) + 1)
|
|
569
|
+
labels = watershed(-distance, markers, mask=otsu.astype(bool))
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def dice_coef(y_true, y_pred, eps=1e-7):
    """Return the Dice coefficient of two masks.

    Both inputs are cast to boolean. ``eps`` is added to the numerator and
    the denominator, so two empty masks score 1.0 instead of dividing by zero.
    """
    truth = y_true.astype(bool)
    guess = y_pred.astype(bool)
    overlap = np.logical_and(truth, guess).sum()
    numerator = 2 * overlap + eps
    denominator = truth.sum() + guess.sum() + eps
    return numerator / denominator
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def iou_mask(y_true, y_pred, eps=1e-7):
    """Return the intersection-over-union of two masks.

    Both inputs are cast to boolean. ``eps`` is added to the intersection
    and the union, so two empty masks score 1.0 instead of dividing by zero.
    """
    truth = y_true.astype(bool)
    guess = y_pred.astype(bool)
    shared = np.logical_and(truth, guess).sum()
    covered = np.logical_or(truth, guess).sum()
    return (shared + eps) / (covered + eps)
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
print("Example Dice self-check:", dice_coef(otsu > 0, otsu > 0))
|
|
587
|
+
print("Example IoU self-check:", iou_mask(otsu > 0, otsu > 0))
|
|
588
|
+
|
|
589
|
+
fig, ax = plt.subplots(1, 4, figsize=(14, 4))
|
|
590
|
+
for a, (title, image) in zip(ax, [("gray", gray), ("otsu", otsu), ("distance", distance), ("watershed", labels)]):
|
|
591
|
+
a.imshow(image, cmap="gray" if title != "watershed" else "nipy_spectral")
|
|
592
|
+
a.set_title(title)
|
|
593
|
+
a.axis("off")
|
|
594
|
+
plt.show()
|
|
595
|
+
|
|
596
|
+
# Minimal U-Net skeleton for real image/mask tensors X_train, Y_train.
|
|
597
|
+
import tensorflow as tf
|
|
598
|
+
from tensorflow.keras import layers, Model
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def conv_block(x, filters):
    """Apply two 3x3 same-padded ReLU convolutions with ``filters`` channels."""
    for _ in range(2):
        x = layers.Conv2D(filters, 3, padding="same", activation="relu")(x)
    return x
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def build_unet(input_shape=(128, 128, 3), num_classes=1):
    """Build a two-level U-Net.

    Args:
        input_shape: ``(height, width, channels)`` of the input. The
            encoder pools twice, so height and width should be divisible
            by 4 for the skip connections to line up.
        num_classes: Number of output channels. With 1 the head uses a
            sigmoid (binary mask); with more it uses a softmax, so the
            per-pixel class scores sum to one.

    Returns:
        An uncompiled Keras ``Model``.
    """
    inputs = layers.Input(input_shape)

    # Encoder: keep the pre-pooling activations for the skip connections.
    c1 = conv_block(inputs, 32)
    p1 = layers.MaxPool2D()(c1)
    c2 = conv_block(p1, 64)
    p2 = layers.MaxPool2D()(c2)

    b = conv_block(p2, 128)

    # Decoder: upsample, join the matching encoder output, then convolve.
    u2 = layers.UpSampling2D()(b)
    u2 = layers.Concatenate()([u2, c2])
    c3 = conv_block(u2, 64)
    u1 = layers.UpSampling2D()(c3)
    u1 = layers.Concatenate()([u1, c1])
    c4 = conv_block(u1, 32)

    # A sigmoid over several channels scores each class independently;
    # mutually exclusive classes need a softmax instead.
    activation = "sigmoid" if num_classes == 1 else "softmax"
    outputs = layers.Conv2D(num_classes, 1, activation=activation)(c4)
    return Model(inputs, outputs)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
unet = build_unet()
|
|
619
|
+
unet.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
|
|
620
|
+
unet.summary()
|
|
621
|
+
# unet.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=8)
|
|
622
|
+
''',
|
|
623
|
+
},
|
|
624
|
+
{
|
|
625
|
+
"id": 10,
|
|
626
|
+
"task": "Сегментация медицинских изображений",
|
|
627
|
+
"description": "U-Net/Mask R-CNN direction, Dice/IoU, connected components and error analysis.",
|
|
628
|
+
"code": r'''
|
|
629
|
+
# Task 10. Medical image segmentation template.
|
|
630
|
+
# pip install tensorflow opencv-python scikit-image matplotlib
|
|
631
|
+
|
|
632
|
+
import cv2
|
|
633
|
+
import numpy as np
|
|
634
|
+
import matplotlib.pyplot as plt
|
|
635
|
+
from skimage.measure import label, regionprops
|
|
636
|
+
import tensorflow as tf
|
|
637
|
+
from tensorflow.keras import layers, Model
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def dice_coef_np(y_true, y_pred, eps=1e-7):
    """Return the Dice coefficient of two masks, computed with NumPy.

    Inputs are cast to boolean. ``eps`` is added to the numerator and the
    denominator, which keeps the ratio defined when both masks are empty.
    """
    mask_true = y_true.astype(bool)
    mask_pred = y_pred.astype(bool)
    twice_overlap = 2 * np.logical_and(mask_true, mask_pred).sum()
    total_pixels = mask_true.sum() + mask_pred.sum()
    return (twice_overlap + eps) / (total_pixels + eps)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def iou_np(y_true, y_pred, eps=1e-7):
    """Return the intersection-over-union of two masks, computed with NumPy.

    Inputs are cast to boolean. ``eps`` is added to the intersection and
    the union, which keeps the ratio defined when both masks are empty.
    """
    mask_true = y_true.astype(bool)
    mask_pred = y_pred.astype(bool)
    intersection = np.logical_and(mask_true, mask_pred).sum()
    combined = np.logical_or(mask_true, mask_pred).sum()
    return (intersection + eps) / (combined + eps)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def build_unet(input_shape=(256, 256, 1)):
    """Build a three-level U-Net with a single-channel sigmoid output.

    The encoder uses 32, 64 and 128 filters with a 256-filter bottleneck.
    The decoder mirrors it, concatenating each upsampled tensor with the
    encoder output of the same resolution.
    """

    def block(x, f):
        # Conv -> BatchNorm -> Conv, both convolutions 3x3 with ReLU.
        x = layers.Conv2D(f, 3, padding="same", activation="relu")(x)
        x = layers.BatchNormalization()(x)
        x = layers.Conv2D(f, 3, padding="same", activation="relu")(x)
        return x

    inp = layers.Input(input_shape)

    # Encoder: remember each pre-pooling tensor for the skip connections.
    skips = []
    x = inp
    for width in (32, 64, 128):
        x = block(x, width)
        skips.append(x)
        x = layers.MaxPooling2D()(x)

    x = block(x, 256)

    # Decoder: walk the skips from deepest to shallowest.
    for width, skip in zip((128, 64, 32), reversed(skips)):
        x = layers.UpSampling2D()(x)
        x = layers.Concatenate()([x, skip])
        x = block(x, width)

    out = layers.Conv2D(1, 1, activation="sigmoid")(x)
    return Model(inp, out)
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
model = build_unet()
|
|
674
|
+
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss="binary_crossentropy")
|
|
675
|
+
|
|
676
|
+
# Real data placeholders:
|
|
677
|
+
# X_train: float32 images in [0, 1], shape (N, 256, 256, 1)
|
|
678
|
+
# Y_train: binary masks, shape (N, 256, 256, 1)
|
|
679
|
+
# model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=20, batch_size=4)
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def analyze_errors(image, true_mask, prob_mask, threshold=0.5):
    """Print segmentation metrics for one sample and plot its error map.

    Args:
        image: Input image; it is squeezed before display.
        true_mask: Ground-truth mask; values above 0 count as foreground.
        prob_mask: Predicted probabilities; values above ``threshold``
            count as foreground.
        threshold: Cut-off applied to ``prob_mask``.

    Prints Dice, IoU, false-positive and false-negative pixel counts, and
    the number and first ten areas of predicted connected components. It
    then shows the image, both masks and an RGB error map in one figure.
    """
    pred = prob_mask > threshold
    true = true_mask > 0
    false_pos = np.logical_and(pred, ~true)
    false_neg = np.logical_and(~pred, true)

    print("Dice:", dice_coef_np(true, pred))
    print("IoU:", iou_np(true, pred))
    print("false positive pixels:", false_pos.sum())
    print("false negative pixels:", false_neg.sum())

    components = regionprops(label(pred))
    areas = [region.area for region in components]
    print("predicted connected components:", len(areas), "areas:", areas[:10])

    # Error map channels: red = false positives, blue = false negatives.
    error_rgb = np.dstack([false_pos, np.zeros_like(false_pos), false_neg]).astype(float)
    panels = [
        ("image", image.squeeze(), "gray"),
        ("true", true, "gray"),
        ("pred", pred, "gray"),
        ("errors: FP red, FN blue", error_rgb, None),
    ]

    fig, ax = plt.subplots(1, 4, figsize=(14, 4))
    for a, (title, arr, cmap) in zip(ax, panels):
        a.imshow(arr, cmap=cmap)
        a.set_title(title)
        a.axis("off")
    plt.show()
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
# Example call after prediction:
|
|
710
|
+
# probs = model.predict(X_val[:1])[0, ..., 0]
|
|
711
|
+
# analyze_errors(X_val[0], Y_val[0, ..., 0], probs)
|
|
712
|
+
|
|
713
|
+
# If instance segmentation is required, use Mask R-CNN from torchvision/detectron2/ultralytics
|
|
714
|
+
# when annotations are object instances rather than one semantic mask.
|
|
715
|
+
''',
|
|
716
|
+
},
|
|
717
|
+
{
|
|
718
|
+
"id": 11,
|
|
719
|
+
"task": "Классификация изображений с использованием Vision Transformer",
|
|
720
|
+
"description": "Fine-tune torchvision ViT and compare it with a small CNN.",
|
|
721
|
+
"code": r'''
|
|
722
|
+
# Task 11. Vision Transformer classification and CNN comparison.
|
|
723
|
+
# pip install torch torchvision
|
|
724
|
+
|
|
725
|
+
import torch
|
|
726
|
+
from torch import nn
|
|
727
|
+
from torch.utils.data import DataLoader, Subset
|
|
728
|
+
from torchvision import datasets, transforms, models
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The pretrained weights carry their own preprocessing pipeline; reuse it.
weights = models.ViT_B_16_Weights.DEFAULT
vit_tf = weights.transforms()

# Keep a small subset for an exam demo; remove Subset for full training.
train_ds = Subset(
    datasets.CIFAR10(root="data", train=True, download=True, transform=vit_tf),
    range(2000),
)
test_ds = Subset(
    datasets.CIFAR10(root="data", train=False, download=True, transform=vit_tf),
    range(500),
)

train_loader = DataLoader(train_ds, shuffle=True, batch_size=32, num_workers=2)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=64, num_workers=2)
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
def train_one(model, epochs=1, lr=1e-4):
    """Train ``model`` on ``train_loader`` and return accuracy on ``test_loader``.

    Only parameters with ``requires_grad`` set are handed to AdamW, so a
    frozen backbone with a fresh head trains just the head.

    Args:
        model: classifier returning logits of shape (batch, classes).
        epochs: number of passes over the module-level ``train_loader``.
        lr: AdamW learning rate.

    Returns:
        Fraction of correctly classified samples in ``test_loader``.
    """
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    trainable = (p for p in model.parameters() if p.requires_grad)
    opt = torch.optim.AdamW(trainable, lr=lr)

    for _ in range(epochs):
        model.train()
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            loss = loss_fn(model(images), labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

    # Evaluation pass: no gradients, layers switched to inference mode.
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            guesses = model(images).argmax(1)
            hits += (guesses == labels).sum().item()
            seen += labels.numel()
    return hits / seen
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
# Transfer learning: freeze the pretrained ViT and train only a new 10-class head.
vit = models.vit_b_16(weights=weights)
for p in vit.parameters():
    p.requires_grad_(False)

head_in = vit.heads.head.in_features
vit.heads.head = nn.Linear(head_in, 10)  # fresh layer, trainable by default

vit_acc = train_one(vit, epochs=1, lr=1e-3)
print("ViT transfer acc:", vit_acc)
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
class SmallCNN(nn.Module):
    """Two-convolution baseline classifier with 10 output logits."""

    def __init__(self):
        super().__init__()
        # Feature extractor: two 3x3 convolutions, then global average pooling.
        stages = [
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        ]
        # Classifier: flatten the 64 pooled channels and map them to 10 classes.
        stages += [nn.Flatten(), nn.Linear(64, 10)]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return class logits for a batch of 3-channel images."""
        logits = self.net(x)
        return logits
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
# Train the small CNN from scratch on the same loaders for a side-by-side number.
cnn_acc = train_one(SmallCNN(), lr=1e-3, epochs=1)
print("Small CNN acc:", cnn_acc)

print(
    "Comparison notes: ViT has global self-attention and benefits from pretraining; "
    "CNN has locality/translation bias and is often cheaper on small data."
)
|
|
795
|
+
''',
|
|
796
|
+
},
|
|
797
|
+
{
|
|
798
|
+
"id": 12,
|
|
799
|
+
"task": "Генерация текстовых описаний изображений",
|
|
800
|
+
"description": "CNN encoder + Transformer decoder skeleton and BLEU evaluation.",
|
|
801
|
+
"code": r'''
|
|
802
|
+
# Task 12. Image captioning: CNN encoder + Transformer decoder skeleton.
|
|
803
|
+
# pip install torch torchvision nltk pillow
|
|
804
|
+
|
|
805
|
+
import math
|
|
806
|
+
import torch
|
|
807
|
+
from torch import nn
|
|
808
|
+
from torchvision import models
|
|
809
|
+
from nltk.translate.bleu_score import corpus_bleu
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
813
|
+
|
|
814
|
+
|
|
815
|
+
class CNNEncoder(nn.Module):
    """ResNet-18 feature extractor that emits a sequence of image tokens."""

    def __init__(self, embed_dim=256):
        super().__init__()
        resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        # Drop the last two children (pooling and classifier) to keep the spatial map.
        trunk = list(resnet.children())[:-2]
        self.backbone = nn.Sequential(*trunk)
        # 1x1 convolution maps the 512 backbone channels to the decoder width.
        self.proj = nn.Conv2d(512, embed_dim, kernel_size=1)

    def forward(self, images):
        """Encode ``images`` into tokens laid out as (S, B, D)."""
        grid = self.backbone(images)  # B, 512, H/32, W/32
        grid = self.proj(grid)        # B, D, h, w
        flat = grid.flatten(2)        # B, D, S
        return flat.permute(2, 0, 1)  # S, B, D
|
|
828
|
+
|
|
829
|
+
|
|
830
|
+
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position codes to a sequence-first tensor.

    The table is stored as a non-trainable buffer ``pe`` of shape
    (max_len, 1, d_model); even columns hold sines and odd columns cosines.
    """

    def __init__(self, d_model, max_len=256):
        """Precompute the table.

        Args:
            d_model: embedding width; odd values are supported.
            max_len: longest sequence the table covers.
        """
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = pos * div  # max_len, ceil(d_model / 2)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column;
        # trimming here avoids the shape mismatch the unsliced form raised.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the codes for its first ``x.size(0)`` positions.

        Args:
            x: tensor whose first axis is the sequence position and whose last
                axis has width ``d_model``.
        """
        return x + self.pe[: x.size(0)]
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
class CaptionModel(nn.Module):
    """Image captioner: CNN encoder tokens feed a Transformer decoder."""

    def __init__(self, vocab_size, embed_dim=256, nhead=8, layers=3, pad_id=0):
        """Build the encoder, token embedding, decoder stack and output layer.

        Args:
            vocab_size: number of tokens in the vocabulary.
            embed_dim: shared width of image tokens and word embeddings.
            nhead: attention heads per decoder layer.
            layers: number of stacked decoder layers.
            pad_id: padding token id; its embedding is the padding index.
        """
        super().__init__()
        self.pad_id = pad_id
        self.encoder = CNNEncoder(embed_dim)
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.pos = PositionalEncoding(embed_dim)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(embed_dim, nhead, dim_feedforward=512),
            num_layers=layers,
        )
        self.out = nn.Linear(embed_dim, vocab_size)

    def forward(self, images, captions_in):
        """Return next-token logits for every position of ``captions_in``.

        Assumes ``captions_in`` holds token ids shaped (batch, time) - TODO
        confirm against the data pipeline. The result has its first two axes
        swapped back relative to the decoder output.
        """
        memory = self.encoder(images)
        word_vecs = self.embed(captions_in)
        # The decoder is sequence-first, so swap the first two axes.
        tgt = self.pos(word_vecs.transpose(0, 1))
        # Causal mask stops each position from attending to later ones.
        steps = tgt.size(0)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(steps)
        causal_mask = causal_mask.to(tgt.device)
        decoded = self.decoder(tgt, memory, tgt_mask=causal_mask)
        logits = self.out(decoded)
        return logits.transpose(0, 1)
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
# Training idea:
|
|
865
|
+
# captions_in = [BOS, w1, w2, ...]
|
|
866
|
+
# captions_out = [w1, w2, ..., EOS]
|
|
867
|
+
# logits = model(images, captions_in)
|
|
868
|
+
# loss = CrossEntropyLoss(ignore_index=pad_id)(logits.reshape(-1, vocab), captions_out.reshape(-1))
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def evaluate_bleu(references, hypotheses, smooth=True):
    """Return the corpus-level BLEU score of tokenized hypotheses.

    Args:
        references: list[list[list[str]]], one list of reference token lists
            per sample, e.g. [[["a", "cat", "sits"]]].
        hypotheses: list[list[str]], one token list per sample,
            e.g. [["a", "cat", "sits"]].
        smooth: when True (default), apply NLTK's ``method1`` smoothing so a
            missing n-gram order does not collapse the whole score to ~0.
            Short captions often have no matching 4-grams. Pass False for
            the plain, unsmoothed score.

    Returns:
        BLEU as a float.
    """
    if not smooth:
        return corpus_bleu(references, hypotheses)
    # Imported here so the block does not depend on a file-level import change.
    from nltk.translate.bleu_score import SmoothingFunction

    return corpus_bleu(
        references,
        hypotheses,
        smoothing_function=SmoothingFunction().method1,
    )
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
# Two toy samples: each has one reference caption and one generated caption.
refs = [
    [["a", "cat", "on", "a", "sofa"]],
    [["a", "dog", "runs"]],
]
hyps = [
    ["a", "cat", "on", "sofa"],
    ["a", "dog", "runs"],
]
print("BLEU example:", evaluate_bleu(refs, hyps))
|
|
880
|
+
''',
|
|
881
|
+
},
|
|
882
|
+
{
|
|
883
|
+
"id": 13,
|
|
884
|
+
"task": "Ручная реализация 2D-свёртки и фильтрации",
|
|
885
|
+
"description": "NumPy convolution with stride, padding, several kernels, comparison with cv2.filter2D and timing.",
|
|
886
|
+
"code": r'''
|
|
887
|
+
# Task 13. Manual 2D convolution and filters.
|
|
888
|
+
# pip install opencv-python matplotlib scikit-image
|
|
889
|
+
|
|
890
|
+
import time
|
|
891
|
+
import cv2
|
|
892
|
+
import numpy as np
|
|
893
|
+
import matplotlib.pyplot as plt
|
|
894
|
+
from skimage import data
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
def conv2d_numpy(image, kernel, stride=1, padding=0):
    """Compute a true 2-D convolution (kernel flipped) of a one-channel image.

    The explicit loops are kept on purpose: the task is a manual
    implementation, not a fast one.

    Args:
        image: 2-D array-like; converted to float32.
        kernel: 2-D array-like; converted to float32.
        stride: step between neighbouring windows, at least 1.
        padding: number of zero rows/columns added on every side, at least 0.

    Returns:
        float32 array of shape
        ((H + 2*padding - kh) // stride + 1, (W + 2*padding - kw) // stride + 1).

    Raises:
        ValueError: if an input is not 2-D, ``stride`` < 1, ``padding`` < 0,
            or the kernel is larger than the padded image.
    """
    image = np.asarray(image, dtype=np.float32)
    kernel = np.asarray(kernel, dtype=np.float32)
    if image.ndim != 2 or kernel.ndim != 2:
        raise ValueError("image and kernel must both be 2-D")
    if stride < 1:
        raise ValueError("stride must be >= 1")
    if padding < 0:
        raise ValueError("padding must be >= 0")

    if padding:
        image = np.pad(image, ((padding, padding), (padding, padding)), mode="constant")

    kh, kw = kernel.shape
    if kh > image.shape[0] or kw > image.shape[1]:
        raise ValueError("kernel is larger than the (padded) image")

    oh = (image.shape[0] - kh) // stride + 1
    ow = (image.shape[1] - kw) // stride + 1
    out = np.zeros((oh, ow), dtype=np.float32)
    # Flipping both axes turns the sliding dot product into a convolution.
    flipped = np.flipud(np.fliplr(kernel))
    for y in range(oh):
        top = y * stride
        for x in range(ow):
            left = x * stride
            patch = image[top:top + kh, left:left + kw]
            out[y, x] = np.sum(patch * flipped)
    return out
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
# Demo input: the scikit-image "camera" picture, downsized to keep the loops fast.
gray = data.camera().astype(np.float32)
gray = cv2.resize(gray, (256, 256), interpolation=cv2.INTER_AREA)

kernels = {
    "blur": np.ones((3, 3), dtype=np.float32) / 9,
    "sharpen": np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32),
    "sobel_x": np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=np.float32),
    "sobel_y": np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=np.float32),
}

fig, ax = plt.subplots(1, len(kernels) + 1, figsize=(14, 4))
ax[0].imshow(gray, cmap="gray")
ax[0].set_title("input")
ax[0].axis("off")

for i, (name, kernel) in enumerate(kernels.items(), start=1):
    t0 = time.perf_counter()
    manual = conv2d_numpy(gray, kernel, stride=1, padding=1)
    t_manual = time.perf_counter() - t0

    # cv2.filter2D computes correlation and reflects the border by default,
    # while conv2d_numpy flips the kernel and pads with zeros. Flip the kernel
    # and use a constant border so both sides compute the same operation;
    # otherwise the reported difference is meaningless.
    cv_kernel = cv2.flip(kernel, -1)
    t0 = time.perf_counter()
    cv = cv2.filter2D(gray, ddepth=-1, kernel=cv_kernel, borderType=cv2.BORDER_CONSTANT)
    t_cv = time.perf_counter() - t0

    diff = np.mean(np.abs(manual - cv))
    print(f"{name:8s} manual={t_manual:.4f}s, cv2={t_cv:.6f}s, mean_abs_diff={diff:.4f}")
    ax[i].imshow(manual, cmap="gray")
    ax[i].set_title(name)
    ax[i].axis("off")

plt.tight_layout()
plt.show()
|
|
946
|
+
''',
|
|
947
|
+
},
|
|
948
|
+
{
|
|
949
|
+
"id": 14,
|
|
950
|
+
"task": "Ручная реализация pooling и простого CNN-блока",
|
|
951
|
+
"description": "Manual max/average pooling, Conv -> ReLU -> Pool block and comparison with PyTorch.",
|
|
952
|
+
"code": r'''
|
|
953
|
+
# Task 14. Manual pooling and simple CNN block.
|
|
954
|
+
# pip install torch opencv-python scikit-image
|
|
955
|
+
|
|
956
|
+
import numpy as np
|
|
957
|
+
import torch
|
|
958
|
+
import torch.nn.functional as F
|
|
959
|
+
from skimage import data
|
|
960
|
+
import cv2
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def pool2d_numpy(x, kernel_size=2, stride=2, mode="max"):
    """Apply max or average pooling to a 2-D array with explicit loops.

    Args:
        x: 2-D array-like.
        kernel_size: side length of the square pooling window.
        stride: step between neighbouring windows.
        mode: "max" for max pooling; "avg" (aliases "mean", "average") for
            average pooling.

    Returns:
        Array of shape ((H - kernel_size) // stride + 1,
        (W - kernel_size) // stride + 1). Max pooling keeps the input dtype.
        Average pooling keeps floating and complex dtypes and returns float64
        for integer or boolean input so the means are not truncated.

    Raises:
        ValueError: if ``mode`` is not one of the supported names.
    """
    x = np.asarray(x)
    if mode == "max":
        use_max = True
        out_dtype = x.dtype
    elif mode in ("avg", "mean", "average"):
        use_max = False
        # An integer output buffer would silently truncate the means.
        out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    else:
        raise ValueError(f"unknown pooling mode: {mode!r}")

    h, w = x.shape
    oh = (h - kernel_size) // stride + 1
    ow = (w - kernel_size) // stride + 1
    out = np.zeros((oh, ow), dtype=out_dtype)
    for y in range(oh):
        for x0 in range(ow):
            patch = x[y * stride:y * stride + kernel_size, x0 * stride:x0 * stride + kernel_size]
            out[y, x0] = patch.max() if use_max else patch.mean()
    return out
|
|
973
|
+
|
|
974
|
+
|
|
975
|
+
def conv2d_valid(x, kernel):
    """Slide ``kernel`` over ``x`` with no padding and unit stride.

    The kernel is applied as-is (not flipped), so this is the
    cross-correlation form of convolution.

    Args:
        x: 2-D NumPy array.
        kernel: 2-D NumPy array, no larger than ``x``.

    Returns:
        float32 array of shape (H - kh + 1, W - kw + 1).
    """
    kh, kw = kernel.shape
    rows = x.shape[0] - kh + 1
    cols = x.shape[1] - kw + 1
    out = np.zeros((rows, cols), dtype=np.float32)
    # Visit every output cell in row-major order.
    for r, c in np.ndindex(rows, cols):
        window = x[r:r + kh, c:c + kw]
        out[r, c] = np.sum(window * kernel)
    return out
|
|
983
|
+
|
|
984
|
+
|
|
985
|
+
# Demo input: "camera" image scaled to [0, 1] and shrunk to 64x64.
img = data.camera().astype(np.float32) / 255.0
img = cv2.resize(img, (64, 64), interpolation=cv2.INTER_AREA)

# Edge kernel: every row is [1, 0, -1].
kernel = np.array(
    [
        [1, 0, -1],
        [1, 0, -1],
        [1, 0, -1],
    ],
    dtype=np.float32,
)

# Conv -> ReLU -> Pool, computed by hand.
conv = conv2d_valid(img, kernel)
relu = np.maximum(conv, 0)
max_pool = pool2d_numpy(relu, mode="max", kernel_size=2, stride=2)
avg_pool = pool2d_numpy(relu, mode="avg", kernel_size=2, stride=2)

print("input:", img.shape)
print("after conv:", conv.shape)
print("after relu:", relu.shape)
print("after max pool:", max_pool.shape)
print("after avg pool:", avg_pool.shape)

# Compare pooling with PyTorch for the same input.
t = torch.tensor(relu).unsqueeze(0).unsqueeze(0)  # add batch and channel axes
torch_max = F.max_pool2d(t, kernel_size=2, stride=2).squeeze().numpy()
torch_avg = F.avg_pool2d(t, kernel_size=2, stride=2).squeeze().numpy()
print("max pool diff:", np.abs(torch_max - max_pool).max())
print("avg pool diff:", np.abs(torch_avg - avg_pool).max())
|
|
1006
|
+
''',
|
|
1007
|
+
},
|
|
1008
|
+
{
|
|
1009
|
+
"id": 15,
|
|
1010
|
+
"task": "Ручная реализация self-attention и позиционного кодирования",
|
|
1011
|
+
"description": "Patchify, scaled dot-product attention, sinusoidal positional encoding and tensor shapes.",
|
|
1012
|
+
"code": r'''
|
|
1013
|
+
# Task 15. Patchify, positional encoding and self-attention.
|
|
1014
|
+
# pip install torch
|
|
1015
|
+
|
|
1016
|
+
import math
|
|
1017
|
+
import torch
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
def patchify(images, patch_size=16):
    """Cut a batch of images into flattened, non-overlapping square patches.

    Args:
        images: tensor shaped (B, C, H, W); H and W must be multiples of
            ``patch_size``.
        patch_size: side length of each square patch.

    Returns:
        Tensor shaped (B, N, C * patch_size * patch_size), where N is the
        number of patches per image in row-major order.
    """
    batch, channels, height, width = images.shape
    assert height % patch_size == 0 and width % patch_size == 0
    # Window the height axis, then the width axis: B, C, nH, nW, p, p.
    tiles = images.unfold(2, patch_size, patch_size)
    tiles = tiles.unfold(3, patch_size, patch_size)
    # Bring the patch-grid axes forward: B, nH, nW, C, p, p.
    tiles = tiles.permute(0, 2, 3, 1, 4, 5)
    # Merge the grid into N and the rest into one feature axis.
    return tiles.reshape(batch, -1, channels * patch_size * patch_size)
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def sinusoidal_positional_encoding(seq_len, dim, device="cpu"):
    """Build a (seq_len, dim) table of sinusoidal position codes.

    Even columns hold sines and odd columns cosines, with frequencies that
    fall geometrically from 1 towards 1/10000.

    Args:
        seq_len: number of positions.
        dim: embedding width; odd values are supported.
        device: device on which the table is created.

    Returns:
        Float tensor of shape (seq_len, dim).
    """
    pe = torch.zeros(seq_len, dim, device=device)
    pos = torch.arange(seq_len, device=device).unsqueeze(1)
    div = torch.exp(torch.arange(0, dim, 2, device=device) * (-math.log(10000.0) / dim))
    angles = pos * div  # seq_len, ceil(dim / 2)
    pe[:, 0::2] = torch.sin(angles)
    # For odd dim there is one fewer cosine column than sine column; trimming
    # here avoids the shape mismatch the unsliced assignment raised.
    pe[:, 1::2] = torch.cos(angles[:, : dim // 2])
    return pe
|
|
1037
|
+
|
|
1038
|
+
|
|
1039
|
+
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute softmax(q k^T / sqrt(d)) v together with the attention weights.

    Args:
        q, k, v: tensors shaped (B, heads, N, d).
        mask: optional tensor broadcastable to the score matrix; positions
            where it equals 0 are excluded from attention.

    Returns:
        Tuple ``(output, weights)``: the attended values and the softmax
        weights over the key axis.
    """
    scale = math.sqrt(q.size(-1))
    logits = (q @ k.transpose(-2, -1)) / scale
    if mask is not None:
        # -inf logits receive zero weight after the softmax.
        logits = logits.masked_fill(mask == 0, float("-inf"))
    weights = logits.softmax(dim=-1)
    return weights @ v, weights
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def split_heads(x, num_heads):
    """Reshape (B, N, D) tokens into per-head slices (B, heads, N, D / heads).

    Args:
        x: tensor shaped (B, N, D).
        num_heads: number of attention heads; must divide D.
    """
    batch, tokens, width = x.shape
    assert width % num_heads == 0
    head_dim = width // num_heads
    # Split the feature axis into (heads, head_dim), then move heads ahead of tokens.
    per_head = x.view(batch, tokens, num_heads, head_dim)
    return per_head.permute(0, 2, 1, 3)
|
|
1055
|
+
|
|
1056
|
+
|
|
1057
|
+
# Demo configuration: two 3x32x32 images, 8x8 patches, 64-wide tokens, 4 heads.
B, C, H, W = 2, 3, 32, 32
patch_size, embed_dim, num_heads = 8, 64, 4

# Images -> flattened patches -> linear embedding -> add position codes.
images = torch.randn(B, C, H, W)
patches = patchify(images, patch_size=patch_size)
proj = torch.nn.Linear(patches.size(-1), embed_dim)
tokens = proj(patches)
tokens = tokens + sinusoidal_positional_encoding(tokens.size(1), embed_dim, tokens.device)

# Separate query/key/value projections, created in q, k, v order.
q_proj, k_proj, v_proj = (torch.nn.Linear(embed_dim, embed_dim) for _ in range(3))
q, k, v = (
    split_heads(layer(tokens), num_heads) for layer in (q_proj, k_proj, v_proj)
)
out, attn = scaled_dot_product_attention(q, k, v)

print("images:", images.shape)
print("patches:", patches.shape)
print("tokens:", tokens.shape)
print("Q/K/V:", q.shape, k.shape, v.shape)
print("attention matrix:", attn.shape)
print("attention output:", out.shape)

# Compare with PyTorch MultiheadAttention on the same token shape.
mha = torch.nn.MultiheadAttention(
    embed_dim=embed_dim,
    num_heads=num_heads,
    batch_first=True,
)
lib_out, lib_attn = mha(tokens, tokens, tokens)
print("library output:", lib_out.shape)
print("library attention:", lib_attn.shape)
|
|
1089
|
+
''',
|
|
1090
|
+
},
|
|
1091
|
+
]
|