circe 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/circe.cpp ADDED
@@ -0,0 +1,404 @@
1
+ #include <ruby.h>
2
+
3
#if 0
/*
 * Estimate the head pose from 5 facial landmarks and draw the result
 * (angles text + gaze arrow) onto `image`.
 *
 * Currently compiled out: it depends on CameraModel / FaceDetection /
 * CommonHelper / Rad2Deg, which are not part of this extension.
 *
 * references:
 *   https://qiita.com/TaroYamada/items/e3f3d0ea4ecc0a832fac
 *   https://github.com/spmallick/learnopencv/blob/master/HeadPose/headPose.cpp
 */
void estimateHeadPose(cv::Mat& image, CameraModel& camera,
                      const FaceDetection::Landmark& landmark)
{
    /* Generic 3D face model (nose tip at the origin). */
    static const std::vector<cv::Point3f> face_object_point_list = {
        {    0.0f,    0.0f,    0.0f }, // Nose
        { -225.0f,  170.0f, -135.0f }, // Left eye
        {  225.0f,  170.0f, -135.0f }, // Right eye
        { -150.0f, -150.0f, -125.0f }, // Left lip corner
        {  150.0f, -150.0f, -125.0f }, // Right lip corner
    };

    /* Each 3D point is listed twice to mirror the duplicated 2D
       points pushed below, keeping solvePnP's point lists aligned. */
    static const std::vector<cv::Point3f> face_object_point_for_pnp_list = {
        face_object_point_list[0], face_object_point_list[0], // Nose
        face_object_point_list[1], face_object_point_list[1], // Left eye
        face_object_point_list[2], face_object_point_list[2], // Right eye
        face_object_point_list[3], face_object_point_list[3], // Left lip
        face_object_point_list[4], face_object_point_list[4], // Right lip
    };

    std::vector<cv::Point2f> face_image_point_list;
    face_image_point_list.push_back(landmark[2]); // Nose
    face_image_point_list.push_back(landmark[2]);
    face_image_point_list.push_back(landmark[0]); // Left eye
    face_image_point_list.push_back(landmark[0]);
    face_image_point_list.push_back(landmark[1]); // Right eye
    face_image_point_list.push_back(landmark[1]);
    face_image_point_list.push_back(landmark[3]); // Left lip
    face_image_point_list.push_back(landmark[3]);
    face_image_point_list.push_back(landmark[4]); // Right lip
    face_image_point_list.push_back(landmark[4]);

    /* Solve for rotation / translation of the model w.r.t. the camera. */
    cv::Mat rvec = cv::Mat_<float>(3, 1);
    cv::Mat tvec = cv::Mat_<float>(3, 1);
    cv::solvePnP(face_object_point_for_pnp_list, face_image_point_list,
                 camera.K, camera.dist_coeff, rvec, tvec,
                 false, cv::SOLVEPNP_ITERATIVE);

    char text[128];
    snprintf(text, sizeof(text),
             "Pitch = %-+4.0f, Yaw = %-+4.0f, Roll = %-+4.0f",
             Rad2Deg(rvec.at<float>(0, 0)),
             Rad2Deg(rvec.at<float>(1, 0)),
             Rad2Deg(rvec.at<float>(2, 0)));
    CommonHelper::drawText(image, text, cv::Point(10, 10), 0.7, 3,
                           cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255),
                           false);

    /* Project a point 500 model-units in front of the nose and draw
       the resulting direction arrow. */
    std::vector<cv::Point3f> nose_end_point3D = { { 0.0f, 0.0f, 500.0f } };
    std::vector<cv::Point2f> nose_end_point2D;
    cv::projectPoints(nose_end_point3D, rvec, tvec,
                      camera.K, camera.dist_coeff, nose_end_point2D);
    cv::arrowedLine(image, face_image_point_list[0], nose_end_point2D[0],
                    cv::Scalar(0, 255, 0), 5);

    /* Calculate Euler angles via projection-matrix decomposition. */
    cv::Mat R;
    cv::Rodrigues(rvec, R);
    cv::Mat projMat = (cv::Mat_<double>(3, 4) <<
        R.at<float>(0, 0), R.at<float>(0, 1), R.at<float>(0, 2), 0,
        R.at<float>(1, 0), R.at<float>(1, 1), R.at<float>(1, 2), 0,
        R.at<float>(2, 0), R.at<float>(2, 1), R.at<float>(2, 2), 0);

    cv::Mat K, R2, tvec2, R_x, R_y, R_z, eulerAngles;
    cv::decomposeProjectionMatrix(projMat, K, R, tvec2,
                                  R_x, R_y, R_z, eulerAngles);
    double pitch = eulerAngles.at<double>(0, 0);
    double yaw   = eulerAngles.at<double>(1, 0);
    double roll  = eulerAngles.at<double>(2, 0);
    snprintf(text, sizeof(text), "X = %-+4.0f, Y = %-+4.0f, Z = %-+4.0f",
             pitch, yaw, roll);
    CommonHelper::drawText(image, text, cv::Point(10, 40), 0.7, 3,
                           cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255),
                           false);
}
#endif
70
+
71
#if 0
/* Disabled call-site sketch for estimateHeadPose(): build a pinhole
   camera model sized from the input image, then run pose estimation
   on one detected face. */
CameraModel camera;
camera.SetIntrinsic(image_input.cols, image_input.rows,
                    FocalLength(image_input.cols, kFovDeg));
camera.SetDist({ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f });
camera.SetExtrinsic({ 0.0f, 0.0f, 0.0f },        /* rvec [deg] */
                    { 0.0f, 0.0f, 0.0f }, true); /* tvec (in world coordinate) */

estimateHeadPose(image_input, camera, face_list[i].second);
#endif
81
+
82
+ #define IF_UNDEF(a, b) \
83
+ ((a) == Qundef) ? (b) : (a)
84
+
85
+
86
+ static VALUE cCirce = Qundef;
87
+ static VALUE eCirceError = Qundef;
88
+
89
+ #include <tuple>
90
+ #include <string>
91
+ #include <chrono>
92
+ #include <opencv2/highgui.hpp>
93
+ #include <opencv2/imgproc.hpp>
94
+
95
+ #include "yolo.h"
96
+ #include "yunet.h"
97
+
98
+ using namespace std;
99
+ using namespace cv;
100
+
101
+
102
+
103
+ // Text parameters.
104
+ const float FONT_SCALE = 0.7;
105
+ const int FONT_FACE = FONT_HERSHEY_SIMPLEX;
106
+ const int THICKNESS = 1;
107
+
108
+ // Colors.
109
+ cv::Scalar BLACK = cv::Scalar(0,0,0);
110
+ cv::Scalar BLUE = cv::Scalar(255, 178, 50);
111
+ cv::Scalar YELLOW = cv::Scalar(0, 255, 255);
112
+ cv::Scalar RED = cv::Scalar(0,0,255);
113
+
114
+
115
+ static ID id_debug;
116
+ static ID id_face;
117
+ static ID id_classify;
118
+ static ID id_class;
119
+ static ID id_png;
120
+ static ID id_jpg;
121
+
122
+
123
+ static Yolo *yolo;
124
+ static YuNet *yunet;
125
+
126
+
127
+
128
+ static void
129
+ draw_label(cv::Mat& img, string label, Point& origin,
130
+ Scalar& fgcolor, Scalar& bgcolor, int thickness = 1)
131
+ {
132
+ int baseLine;
133
+ cv::Size label_size =
134
+ cv::getTextSize(label, FONT_FACE, FONT_SCALE, thickness, &baseLine);
135
+
136
+ Point a = { origin.x, max(origin.y, label_size.height) };
137
+ Point b = { a.x + label_size.width,
138
+ a.y + label_size.height + baseLine };
139
+ Point t = { a.x, a.y + label_size.height };
140
+
141
+ cv::rectangle(img, a, b, bgcolor, cv::FILLED);
142
+ cv::putText(img, label, t, FONT_FACE, FONT_SCALE, fgcolor, THICKNESS);
143
+ }
144
+
145
+ static void
146
+ draw_labelbox(cv::Mat& img, string label, Rect& box,
147
+ Scalar& framecolor = BLUE, Scalar& textcolor = BLACK,
148
+ int thickness = 1) {
149
+
150
+ Point a = { box.x, box.y };
151
+ Point b = { box.x + box.width, box.y + box.height };
152
+
153
+ cv::rectangle(img, a, b, framecolor, thickness);
154
+ draw_label(img, label, a, textcolor, framecolor);
155
+ }
156
+
157
+
158
+
159
+
160
+
161
+ void
162
+ yunet_process_features(vector<YuNet::Face>& faces,
163
+ Mat& img, VALUE v_features, int *state)
164
+ {
165
+ for (int i = 0; i < faces.size(); i++) {
166
+ Rect box = faces[i].first;
167
+ YuNet::Landmark lmark = faces[i].second;
168
+
169
+ VALUE v_type = ID2SYM(id_face);
170
+ VALUE v_box = rb_ary_new_from_args(4,
171
+ INT2NUM(box.x ), INT2NUM(box.y ),
172
+ INT2NUM(box.width), INT2NUM(box.height));
173
+ VALUE v_landmark = rb_ary_new_from_args(5,
174
+ rb_ary_new_from_args(2, INT2NUM(lmark[0].x),
175
+ INT2NUM(lmark[0].y)),
176
+ rb_ary_new_from_args(2, INT2NUM(lmark[1].x),
177
+ INT2NUM(lmark[1].y)),
178
+ rb_ary_new_from_args(2, INT2NUM(lmark[2].x),
179
+ INT2NUM(lmark[2].y)),
180
+ rb_ary_new_from_args(2, INT2NUM(lmark[3].x),
181
+ INT2NUM(lmark[3].y)),
182
+ rb_ary_new_from_args(2, INT2NUM(lmark[4].x),
183
+ INT2NUM(lmark[4].y)));
184
+ VALUE v_feature = rb_ary_new_from_args(3, v_type, v_box,
185
+ v_landmark);
186
+ rb_ary_push(v_features, v_feature);
187
+
188
+ if (!img.empty() && rb_block_given_p()) {
189
+ VALUE v_annotation= rb_yield_splat(v_feature);
190
+ VALUE v_label = Qnil;
191
+ VALUE v_color = ULONG2NUM(0x0000ff);
192
+ VALUE v_thickness = INT2NUM(1);
193
+
194
+ switch (TYPE(v_annotation)) {
195
+ case T_NIL:
196
+ break;
197
+ case T_HASH:
198
+ break;
199
+ case T_ARRAY:
200
+ switch(RARRAY_LENINT(v_annotation)) {
201
+ default:
202
+ case 3: v_thickness = RARRAY_AREF(v_annotation, 2);
203
+ case 2: v_color = RARRAY_AREF(v_annotation, 1);
204
+ case 1: v_label = RARRAY_AREF(v_annotation, 0);
205
+ case 0: break;
206
+ }
207
+ break;
208
+ case T_STRING:
209
+ v_label = v_annotation;
210
+ break;
211
+ }
212
+
213
+ if (! NIL_P(v_label)) {
214
+ string label = StringValueCStr(v_label);
215
+ long rgb = NUM2ULONG(v_color);
216
+ int thickness = NUM2INT(v_thickness);
217
+ Scalar color = cv::Scalar((rgb >> 0) & 0xFF,
218
+ (rgb >> 8) & 0xFF,
219
+ (rgb >> 16) & 0xFF);
220
+ draw_labelbox(img, label, box, color, BLACK, thickness);
221
+
222
+ for (const auto& p : lmark) {
223
+ cv::circle(img, p, 3, cv::Scalar(255, 0, 0), 2);
224
+ }
225
+ }
226
+ }
227
+ }
228
+ }
229
+
230
+
231
+
232
+ void
233
+ yolo_process_features(vector<Yolo::Item>& items,
234
+ Mat& img, VALUE v_features, int *state)
235
+ {
236
+ for (int i = 0; i < items.size(); i++) {
237
+ string name = std::get<0>(items[i]);
238
+ float confidence = std::get<1>(items[i]);
239
+ Rect box = std::get<2>(items[i]);
240
+
241
+ VALUE v_type = ID2SYM(id_class);
242
+ VALUE v_name = rb_str_new(name.c_str(), name.size());
243
+ VALUE v_confidence = DBL2NUM(confidence);
244
+ VALUE v_box = rb_ary_new_from_args(4,
245
+ INT2NUM(box.x ), INT2NUM(box.y ),
246
+ INT2NUM(box.width), INT2NUM(box.height));
247
+ VALUE v_feature = rb_ary_new_from_args(4, v_type, v_box,
248
+ v_name, v_confidence);
249
+ rb_ary_push(v_features, v_feature);
250
+
251
+ if (rb_block_given_p()) {
252
+ VALUE v_annotation = rb_yield_splat(v_feature);
253
+
254
+ VALUE v_label = Qnil;
255
+ VALUE v_color = ULONG2NUM(0x0000ff);
256
+ VALUE v_thickness = INT2NUM(1);
257
+
258
+ switch (TYPE(v_annotation)) {
259
+ case T_NIL:
260
+ break;
261
+ case T_HASH:
262
+ break;
263
+ case T_ARRAY:
264
+ switch(RARRAY_LENINT(v_annotation)) {
265
+ default:
266
+ case 3: v_thickness = RARRAY_AREF(v_annotation, 2);
267
+ case 2: v_color = RARRAY_AREF(v_annotation, 1);
268
+ case 1: v_label = RARRAY_AREF(v_annotation, 0);
269
+ case 0: break;
270
+ }
271
+ break;
272
+ case T_STRING:
273
+ v_label = v_annotation;
274
+ break;
275
+ }
276
+
277
+ if (! NIL_P(v_label)) {
278
+ string label = StringValueCStr(v_label);
279
+ long rgb = NUM2ULONG(v_color);
280
+ int thickness = NUM2INT(v_thickness);
281
+ Scalar color = cv::Scalar((rgb >> 0) & 0xFF,
282
+ (rgb >> 8) & 0xFF,
283
+ (rgb >> 16) & 0xFF);
284
+ draw_labelbox(img, label, box, color, BLACK, thickness);
285
+ }
286
+ }
287
+ }
288
+ }
289
+
290
+
291
+
292
+ static VALUE
293
+ circe_m_analyze(int argc, VALUE* argv, VALUE self) {
294
+ // Retrieve arguments
295
+ VALUE v_imgstr, v_format, v_opts;
296
+ VALUE kwargs[3];
297
+ rb_scan_args(argc, argv, "11:", &v_imgstr, &v_format, &v_opts);
298
+ rb_get_kwargs(v_opts, (ID[]){ id_debug, id_face, id_classify },
299
+ 0, 3, kwargs);
300
+ VALUE v_debug = IF_UNDEF(kwargs[0], Qfalse);
301
+ VALUE v_face = IF_UNDEF(kwargs[1], Qfalse);
302
+ VALUE v_classify = IF_UNDEF(kwargs[2], Qfalse);
303
+
304
+ VALUE v_features = rb_ary_new();
305
+ VALUE v_image = Qnil;
306
+
307
+ if (! NIL_P(v_format)) {
308
+ Check_Type(v_format, T_SYMBOL);
309
+ ID i_format = rb_sym2id(v_format);
310
+ if ((i_format != id_png) && (i_format != id_jpg))
311
+ rb_raise(rb_eArgError, "format must be :png, :jpg or nil");
312
+ }
313
+
314
+ if (!RTEST(v_face) && !RTEST(v_classify)) {
315
+ v_face = v_classify = Qtrue;
316
+ }
317
+
318
+
319
+ // Load image.
320
+ Mat i_img = cv::imdecode(cv::Mat(1, RSTRING_LEN(v_imgstr), CV_8UC1,
321
+ (unsigned char *)RSTRING_PTR(v_imgstr)),
322
+ IMREAD_UNCHANGED);
323
+ Mat o_img = NIL_P(v_format) ? cv::Mat() : i_img.clone();
324
+
325
+ // Processing
326
+ std::chrono::time_point<std::chrono::system_clock> start_time, end_time;
327
+ std::chrono::duration<double> duration;
328
+ int state = 0;
329
+
330
+ start_time = std::chrono::system_clock::now();
331
+
332
+ if (RTEST(v_classify)) {
333
+ vector<Yolo::Item> items;
334
+ yolo->process(i_img, items);
335
+ yolo_process_features(items, o_img, v_features, &state);
336
+ if (state) goto exception;
337
+ }
338
+
339
+ if (RTEST(v_face)) {
340
+ vector<YuNet::Face> faces;
341
+ yunet->process(i_img, faces);
342
+ yunet_process_features(faces, o_img, v_features, &state);
343
+ if (state) goto exception;
344
+ }
345
+
346
+ end_time = std::chrono::system_clock::now();
347
+ duration = end_time - start_time;
348
+
349
+
350
+ if (! NIL_P(v_format)) {
351
+ if (RTEST(v_debug)) {
352
+ double ms = duration / 1.0ms;
353
+ string label = cv::format("Inference time : %0.2f ms", ms);
354
+ cv::putText(o_img, label, Point(20, 40),
355
+ FONT_FACE, FONT_SCALE, RED);
356
+ }
357
+
358
+ ID i_format = rb_sym2id(v_format);
359
+ string format;
360
+ if (i_format == id_png) { format = ".png"; }
361
+ else if (i_format == id_jpg) { format = ".jpg"; }
362
+
363
+ std::vector<uchar> buf;
364
+ cv::imencode(format, o_img, buf);
365
+ v_image = rb_str_new(reinterpret_cast<char*>(buf.data()), buf.size());
366
+ }
367
+ return rb_ary_new_from_args(2, v_features, v_image);
368
+
369
+ exception:
370
+ i_img.release();
371
+ o_img.release();
372
+ rb_jump_tag(state);
373
+ }
374
+
375
+
376
+
377
+
378
+ extern "C"
379
+ void Init_core(void) {
380
+ /* Main classes */
381
+ cCirce = rb_define_class("Circe", rb_cObject);
382
+ eCirceError = rb_define_class_under(cCirce, "Error", rb_eStandardError);
383
+ // myclass = rb_const_get(mymodule, sym_myclass);
384
+
385
+ VALUE v_onnx_yolo = rb_const_get(cCirce, rb_intern("ONNX_YOLO"));
386
+ VALUE v_onnx_yunet = rb_const_get(cCirce, rb_intern("ONNX_YUNET"));
387
+
388
+ static Yolo _yolo = { StringValueCStr(v_onnx_yolo ) };
389
+ static YuNet _yunet = { StringValueCStr(v_onnx_yunet) };
390
+
391
+ yolo = &_yolo;
392
+ yunet = &_yunet;
393
+
394
+
395
+ id_debug = rb_intern_const("debug" );
396
+ id_face = rb_intern_const("face" );
397
+ id_classify = rb_intern_const("classify");
398
+ id_class = rb_intern_const("class" );
399
+ id_png = rb_intern_const("png" );
400
+ id_jpg = rb_intern_const("jpg" );
401
+
402
+
403
+ rb_define_method(cCirce, "analyze", circe_m_analyze, -1);
404
+ }
data/ext/extconf.rb ADDED
@@ -0,0 +1,15 @@
1
require 'mkmf'

# On *BSD the pkgconf binary lives outside mkmf's default search path.
$PKGCONFIG =
  case RbConfig::CONFIG['host_os']
  when /bsd/ then '/usr/local/bin/pkgconf'
  end

# Locate OpenCV 4.  Fail early with a clear message rather than
# emitting a Makefile that cannot compile.
cflags, ldflags, libs = pkg_config('opencv4')
abort 'ERROR: pkg-config could not find opencv4' if cflags.nil?

$LDFLAGS  += " #{ldflags} #{libs}"
$INCFLAGS += " #{cflags}"
# BUG FIX: the flag was previously appended without a leading space,
# fusing it onto the preceding flag (e.g. "...-O2-std=c++17") and
# breaking the compiler command line.
$CXXFLAGS += " -std=c++17"

create_makefile("circe/core")
data/ext/yolo.cpp ADDED
@@ -0,0 +1,94 @@
1
+ #include <tuple>
2
+ #include <string>
3
+ #include <opencv2/imgproc.hpp>
4
+ #include <opencv2/dnn.hpp>
5
+
6
+ #include "yolo.h"
7
+
8
+
9
+ Yolo::Yolo(const std::string& model) {
10
+ net = cv::dnn::readNetFromONNX(model);
11
+ }
12
+
13
+
14
+ void Yolo::process(cv::Mat &img, std::vector<Yolo::Item> &items) {
15
+
16
+ // Pre-process
17
+ cv::Mat blob;
18
+ std::vector<cv::Mat> outputs;
19
+
20
+ cv::dnn::blobFromImage(img, blob, 1./255.,
21
+ cv::Size(INPUT_WIDTH, INPUT_HEIGHT),
22
+ cv::Scalar(), true, false);
23
+
24
+ // Process
25
+ net.setInput(blob);
26
+ net.forward(outputs, net.getUnconnectedOutLayersNames());
27
+
28
+ // Post-process
29
+ std::vector<int> class_ids;
30
+ std::vector<float> confidences;
31
+ std::vector<cv::Rect> boxes;
32
+
33
+ // Resizing factor.
34
+ float x_factor = img.cols / INPUT_WIDTH;
35
+ float y_factor = img.rows / INPUT_HEIGHT;
36
+
37
+ float *data = (float *)outputs[0].data;
38
+ const int dimensions = 85;
39
+
40
+ // 25200 for default size 640.
41
+ const int rows = 25200;
42
+ // Iterate through 25200 detections.
43
+ for (int i = 0; i < rows; ++i) {
44
+ float confidence = data[4];
45
+
46
+ // Discard bad detections and continue.
47
+ if (confidence >= CONFIDENCE_THRESHOLD) {
48
+ float *classes_scores = data + 5;
49
+
50
+ // Create a 1x85 Mat and store class scores of 80 classes.
51
+ cv::Mat scores(1, names.size(), CV_32FC1, classes_scores);
52
+
53
+ // Perform minMaxLoc and acquire the index of best class score.
54
+ cv::Point class_id;
55
+ double max_class_score;
56
+ minMaxLoc(scores, 0, &max_class_score, 0, &class_id);
57
+
58
+ // Continue if the class score is above the threshold.
59
+ if (max_class_score > SCORE_THRESHOLD) {
60
+ // Store class ID and confidence in the pre-defined
61
+ // respective vectors.
62
+ float cx = data[0]; // Center
63
+ float cy = data[1];
64
+ float w = data[2]; // Box dimension
65
+ float h = data[3];
66
+
67
+ // Bounding box coordinates.
68
+ int left = int((cx - 0.5 * w) * x_factor);
69
+ int top = int((cy - 0.5 * h) * y_factor);
70
+ int width = int(w * x_factor);
71
+ int height = int(h * y_factor);
72
+
73
+ // Store good detections in the boxes vector.
74
+ confidences.push_back(confidence);
75
+ class_ids.push_back(class_id.x);
76
+ boxes.push_back(cv::Rect(left, top, width, height));
77
+ }
78
+ }
79
+ // Jump to the next row.
80
+ data += 85;
81
+ }
82
+
83
+ // Perform Non-Maximum Suppression and draw predictions.
84
+ std::vector<int> indices;
85
+ cv::dnn::NMSBoxes(boxes, confidences,
86
+ SCORE_THRESHOLD, NMS_THRESHOLD, indices);
87
+
88
+ items.clear();
89
+
90
+ for (int i = 0; i < indices.size(); i++) {
91
+ int idx = indices[i];
92
+ items.push_back({ names[class_ids[idx]],confidences[idx],boxes[idx] });
93
+ }
94
+ }
data/ext/yolo.h ADDED
@@ -0,0 +1,48 @@
1
+ #ifndef __YOLO__
2
+ #define __YOLO__
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ #include <tuple>
7
+
8
+ #include <opencv2/dnn.hpp>
9
+
10
+
11
+ class Yolo
12
+ {
13
+ public:
14
+ typedef std::tuple<std::string, float, cv::Rect> Item;
15
+
16
+ private:
17
+ static constexpr float INPUT_WIDTH = 640.0;
18
+ static constexpr float INPUT_HEIGHT = 640.0;
19
+ static constexpr float SCORE_THRESHOLD = 0.5;
20
+ static constexpr float NMS_THRESHOLD = 0.45;
21
+ static constexpr float CONFIDENCE_THRESHOLD = 0.45;
22
+
23
+ public:
24
+ const std::vector<std::string> names = {
25
+ "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train",
26
+ "truck", "boat", "traffic light", "fire hydrant", "stop sign",
27
+ "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
28
+ "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
29
+ "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
30
+ "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
31
+ "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
32
+ "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
33
+ "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
34
+ "sofa", "potted plant", "bed", "dining table", "toilet", "tvmonitor",
35
+ "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
36
+ "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
37
+ "scissors", "teddy bear", "hair drier", "toothbrush"
38
+ };
39
+
40
+ public:
41
+ Yolo(const std::string& model);
42
+ void process(cv::Mat &img, std::vector<Item> &items);
43
+
44
+ private:
45
+ cv::dnn::Net net;
46
+ };
47
+
48
+ #endif
data/ext/yunet.cpp ADDED
@@ -0,0 +1,132 @@
1
+ #include <cmath>
2
+ #include <string>
3
+ #include <vector>
4
+ #include <numeric>
5
+ #include <algorithm>
6
+
7
+ #include <opencv2/dnn.hpp>
8
+
9
+ #include "yunet.h"
10
+
11
+
12
+ YuNet::YuNet(const std::string& model_filename)
13
+ {
14
+ net = cv::dnn::readNetFromONNX(model_filename);
15
+ net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
16
+ net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
17
+ }
18
+
19
+
20
+ void YuNet::process(const cv::Mat& img, std::vector<YuNet::Face>& faces)
21
+ {
22
+ /* -- Preparing for image size -- */
23
+ cv::Size model_size;
24
+ model_size.width = MODEL_WIDTH;
25
+ model_size.height = MODEL_WIDTH * img.rows / img.cols;
26
+ model_size.height = (model_size.height / 32) * 32;
27
+
28
+ std::pair<int32_t, int32_t> feature_map_2th = {
29
+ (model_size.height + 1) / 2 / 2,
30
+ (model_size.width + 1) / 2 / 2
31
+ };
32
+
33
+ std::vector<std::pair<int32_t, int32_t>> feature_map_list;
34
+ feature_map_list.push_back({ (feature_map_2th.first + 1) / 2 ,
35
+ (feature_map_2th.second + 1) / 2 });
36
+
37
+ for (int32_t i = 0; i < 3; i++) {
38
+ const auto& previous = feature_map_list.back();
39
+ feature_map_list.push_back({ (previous.first + 1) / 2 ,
40
+ (previous.second + 1) / 2 });
41
+ }
42
+
43
+ std::vector<std::vector<float>> prior_list;
44
+ for (int i = 0; i < static_cast<int32_t>(feature_map_list.size()); i++) {
45
+ const auto& min_sizes = MIN_SIZES[i];
46
+ const auto& feature_map = feature_map_list[i];
47
+ for (int y = 0; y < feature_map.first; y++) {
48
+ for (int x = 0; x < feature_map.second; x++) {
49
+ for (const auto& min_size : min_sizes) {
50
+ float s_kx = static_cast<float>(min_size) / model_size.width;
51
+ float s_ky = static_cast<float>(min_size) / model_size.height;
52
+ float cx = (x + 0.5f) * STEPS[i] / model_size.width;
53
+ float cy = (y + 0.5f) * STEPS[i] / model_size.height;
54
+ prior_list.push_back({ cx, cy, s_kx, s_ky });
55
+ }
56
+ }
57
+ }
58
+ }
59
+
60
+
61
+ /* -- Pre-process -- */
62
+ cv::Mat blob;
63
+ cv::dnn::blobFromImage(img, blob, 1.0, model_size);
64
+
65
+ /* -- Inference -- */
66
+ std::vector<cv::Mat> outputs;
67
+ net.setInput(blob);
68
+ net.forward(outputs, { "conf", "iou", "loc" });
69
+
70
+ /* -- Post Process -- */
71
+ const cv::Mat& mat_conf = outputs[0];
72
+ const cv::Mat& mat_iou = outputs[1];
73
+ const cv::Mat& mat_loc = outputs[2];
74
+ const cv::Size image_size = img.size();
75
+
76
+ // Get score list
77
+ std::vector<float> cls_score;
78
+ for (int32_t row = 0; row < mat_conf.rows; row++) {
79
+ float val = mat_conf.at<float>(cv::Point(1, row));
80
+ cls_score.push_back(std::clamp(val, 0.0f, 1.0f));
81
+ }
82
+
83
+ std::vector<float> iou_score;
84
+ for (int32_t row = 0; row < mat_iou.rows; row++) {
85
+ float val = mat_conf.at<float>(cv::Point(0, row));
86
+ iou_score.push_back(std::clamp(val, 0.0f, 1.0f));
87
+ }
88
+
89
+ std::vector<float> score;
90
+ for (int32_t row = 0; row < mat_conf.rows; row++) {
91
+ score.push_back(std::sqrt(cls_score[row] * iou_score[row]));
92
+ }
93
+
94
+ // All bbox
95
+ std::vector<cv::Rect> bbox_all;
96
+ for (int row = 0; row < mat_loc.rows; row++) {
97
+ float cx = mat_loc.at<float>(cv::Point(0, row));
98
+ float cy = mat_loc.at<float>(cv::Point(1, row));
99
+ float w = mat_loc.at<float>(cv::Point(2, row));
100
+ float h = mat_loc.at<float>(cv::Point(3, row));
101
+
102
+ cx = prior_list[row][0] + cx * VARIANCES[0] * prior_list[row][2];
103
+ cy = prior_list[row][1] + cy * VARIANCES[0] * prior_list[row][3];
104
+ w = prior_list[row][2] * std::exp(w * VARIANCES[0]);
105
+ h = prior_list[row][3] * std::exp(h * VARIANCES[1]);
106
+
107
+ bbox_all.push_back({
108
+ static_cast<int32_t>((cx - w / 2) * image_size.width),
109
+ static_cast<int32_t>((cy - h / 2) * image_size.height),
110
+ static_cast<int32_t>(w * image_size.width),
111
+ static_cast<int32_t>(h * image_size.height) });
112
+ }
113
+
114
+ // Non-Maximum Suppression
115
+ std::vector<int> indices;
116
+ cv::dnn::NMSBoxes(bbox_all, score, CONF_THRESHOLD, NMS_THRESHOLD, indices);
117
+
118
+ // Get valid bbox and landmark
119
+ faces.clear();
120
+ for (int idx : indices) {
121
+ Landmark landmark; // (landmark is 5 points)
122
+ for (int i = 0; i < static_cast<int>(landmark.size()); i++) {
123
+ cv::Point& p = landmark[i];
124
+ float x = mat_loc.at<float>(cv::Point(4 + i * 2, idx));
125
+ float y = mat_loc.at<float>(cv::Point(4 + i * 2 + 1, idx));
126
+ p.x = static_cast<int32_t>((prior_list[idx][0] + x * VARIANCES[0] * prior_list[idx][2]) * image_size.width);
127
+ p.y = static_cast<int32_t>((prior_list[idx][1] + y * VARIANCES[0] * prior_list[idx][3]) * image_size.height);
128
+ }
129
+
130
+ faces.push_back({ bbox_all[idx], landmark });
131
+ }
132
+ }