circe 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/circe.cpp ADDED
@@ -0,0 +1,404 @@
1
+ #include <ruby.h>
2
+
3
#if 0   /* Dead code: head-pose estimation experiment, kept for reference. */
/*
 * Estimate the head pose from the 5 detected facial landmarks with
 * cv::solvePnP against a canonical 3D face model, then draw the
 * resulting angles and a nose-direction arrow onto `image`.
 *
 * reference: https://qiita.com/TaroYamada/items/e3f3d0ea4ecc0a832fac
 * reference: https://github.com/spmallick/learnopencv/blob/master/HeadPose/headPose.cpp
 */
void estimateHeadPose(cv::Mat& image, CameraModel& camera, const FaceDetection::Landmark& landmark)
{
    /* Canonical 3D face model (nose tip at origin; arbitrary scale). */
    static const std::vector<cv::Point3f> face_object_point_list = {
        { 0.0f, 0.0f, 0.0f },           // Nose
        { -225.0f, 170.0f, -135.0f },   // Left eye
        { 225.0f, 170.0f, -135.0f },    // Right eye
        { -150.0f, -150.0f, -125.0f },  // Left lip
        { 150.0f, -150.0f, -125.0f },   // Right lip
    };

    /* Each correspondence is duplicated (object and image side alike)
     * before being handed to solvePnP — presumably to weight the five
     * points; confirm against the referenced articles. */
    static const std::vector<cv::Point3f> face_object_point_for_pnp_list = {
        face_object_point_list[0], face_object_point_list[0], // Nose
        face_object_point_list[1], face_object_point_list[1], // Left eye
        face_object_point_list[2], face_object_point_list[2], // Right eye
        face_object_point_list[3], face_object_point_list[3], // Left lip
        face_object_point_list[4], face_object_point_list[4], // Right lip
    };

    /* Image-space landmarks, duplicated to mirror the object list.
     * Note the order: landmark[2] (nose) is pushed first so that the
     * lists pair up with the model points above. */
    std::vector<cv::Point2f> face_image_point_list;
    face_image_point_list.push_back(landmark[2]); // Nose
    face_image_point_list.push_back(landmark[2]); // Nose
    face_image_point_list.push_back(landmark[0]); // Left eye
    face_image_point_list.push_back(landmark[0]); // Left eye
    face_image_point_list.push_back(landmark[1]); // Right eye
    face_image_point_list.push_back(landmark[1]); // Right eye
    face_image_point_list.push_back(landmark[3]); // Left lip
    face_image_point_list.push_back(landmark[3]); // Left lip
    face_image_point_list.push_back(landmark[4]); // Right lip
    face_image_point_list.push_back(landmark[4]); // Right lip

    /* Solve for the rotation/translation of the model in camera space. */
    cv::Mat rvec = cv::Mat_<float>(3, 1);
    cv::Mat tvec = cv::Mat_<float>(3, 1);
    cv::solvePnP(face_object_point_for_pnp_list, face_image_point_list, camera.K, camera.dist_coeff, rvec, tvec, false, cv::SOLVEPNP_ITERATIVE);

    /* Raw Rodrigues-vector components displayed as pitch/yaw/roll. */
    char text[128];
    snprintf(text, sizeof(text), "Pitch = %-+4.0f, Yaw = %-+4.0f, Roll = %-+4.0f", Rad2Deg(rvec.at<float>(0, 0)), Rad2Deg(rvec.at<float>(1, 0)), Rad2Deg(rvec.at<float>(2, 0)));
    CommonHelper::drawText(image, text, cv::Point(10, 10), 0.7, 3, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255), false);

    /* Project a point 500 units in front of the nose and draw the
     * facing-direction arrow from the nose landmark towards it. */
    std::vector<cv::Point3f> nose_end_point3D = { { 0.0f, 0.0f, 500.0f } };
    std::vector<cv::Point2f> nose_end_point2D;
    cv::projectPoints(nose_end_point3D, rvec, tvec,
                      camera.K, camera.dist_coeff, nose_end_point2D);
    cv::arrowedLine(image, face_image_point_list[0], nose_end_point2D[0],
                    cv::Scalar(0, 255, 0), 5);


    /* Calculate Euler Angle */
    cv::Mat R;
    cv::Rodrigues(rvec, R);
    cv::Mat projMat = (cv::Mat_<double>(3, 4) <<
                       R.at<float>(0, 0), R.at<float>(0, 1), R.at<float>(0, 2), 0,
                       R.at<float>(1, 0), R.at<float>(1, 1), R.at<float>(1, 2), 0,
                       R.at<float>(2, 0), R.at<float>(2, 1), R.at<float>(2, 2), 0);

    /* decomposeProjectionMatrix reports Euler angles in degrees. */
    cv::Mat K, R2, tvec2, R_x, R_y, R_z, eulerAngles;
    cv::decomposeProjectionMatrix(projMat, K, R, tvec2,
                                  R_x, R_y, R_z, eulerAngles);
    double pitch = eulerAngles.at<double>(0, 0);
    double yaw   = eulerAngles.at<double>(1, 0);
    double roll  = eulerAngles.at<double>(2, 0);
    snprintf(text, sizeof(text), "X = %-+4.0f, Y = %-+4.0f, Z = %-+4.0f", pitch, yaw, roll);
    CommonHelper::drawText(image, text, cv::Point(10, 40), 0.7, 3, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255), false);
}
#endif
70
+
71
#if 0   /* Dead code: example call site for estimateHeadPose() above
         * (depends on CameraModel / FocalLength / kFovDeg from the
         * original project; not compiled here). */
    CameraModel camera;
    camera.SetIntrinsic(image_input.cols, image_input.rows, FocalLength(image_input.cols, kFovDeg));
    camera.SetDist({ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f });
    camera.SetExtrinsic({ 0.0f, 0.0f, 0.0f },        /* rvec [deg] */
                        { 0.0f, 0.0f, 0.0f }, true); /* tvec (in world coordinate) */

    estimateHeadPose(image_input, camera, face_list[i].second);

#endif
81
+
82
/* Evaluate to `b` when keyword-argument slot `a` was left unset
 * (rb_get_kwargs stores Qundef in unfilled slots). */
#define IF_UNDEF(a, b) \
    ((a) == Qundef) ? (b) : (a)


static VALUE cCirce      = Qundef;  // class Circe
static VALUE eCirceError = Qundef;  // class Circe::Error

#include <tuple>
#include <string>
#include <chrono>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

#include "yolo.h"
#include "yunet.h"

using namespace std;
using namespace cv;



// Text parameters.
const float FONT_SCALE = 0.7;
const int   FONT_FACE  = FONT_HERSHEY_SIMPLEX;
const int   THICKNESS  = 1;

// Colors (BGR channel order, as OpenCV expects).
cv::Scalar BLACK  = cv::Scalar(0,0,0);
cv::Scalar BLUE   = cv::Scalar(255, 178, 50);
cv::Scalar YELLOW = cv::Scalar(0, 255, 255);
cv::Scalar RED    = cv::Scalar(0,0,255);


// Interned Ruby symbol ids, populated once in Init_core().
static ID id_debug;
static ID id_face;
static ID id_classify;
static ID id_class;
static ID id_png;
static ID id_jpg;


// Detector singletons, wired up in Init_core().
static Yolo  *yolo;
static YuNet *yunet;
125
+
126
+
127
+
128
+ static void
129
+ draw_label(cv::Mat& img, string label, Point& origin,
130
+ Scalar& fgcolor, Scalar& bgcolor, int thickness = 1)
131
+ {
132
+ int baseLine;
133
+ cv::Size label_size =
134
+ cv::getTextSize(label, FONT_FACE, FONT_SCALE, thickness, &baseLine);
135
+
136
+ Point a = { origin.x, max(origin.y, label_size.height) };
137
+ Point b = { a.x + label_size.width,
138
+ a.y + label_size.height + baseLine };
139
+ Point t = { a.x, a.y + label_size.height };
140
+
141
+ cv::rectangle(img, a, b, bgcolor, cv::FILLED);
142
+ cv::putText(img, label, t, FONT_FACE, FONT_SCALE, fgcolor, THICKNESS);
143
+ }
144
+
145
+ static void
146
+ draw_labelbox(cv::Mat& img, string label, Rect& box,
147
+ Scalar& framecolor = BLUE, Scalar& textcolor = BLACK,
148
+ int thickness = 1) {
149
+
150
+ Point a = { box.x, box.y };
151
+ Point b = { box.x + box.width, box.y + box.height };
152
+
153
+ cv::rectangle(img, a, b, framecolor, thickness);
154
+ draw_label(img, label, a, textcolor, framecolor);
155
+ }
156
+
157
+
158
+
159
+
160
+
161
/*
 * Append one Ruby feature per detected face to v_features and, when an
 * output image and a Ruby block are given, let the block decide how to
 * annotate the image.
 *
 * Each feature is the array:
 *   [:face, [x, y, w, h], [[x,y], [x,y], [x,y], [x,y], [x,y]]]
 * (the 5 landmark points come straight from YuNet; see yunet.cpp).
 *
 * The block's return value selects the annotation:
 *   nil     -> draw nothing
 *   String  -> label with default color (0x0000ff) and thickness (1)
 *   Array   -> [label, color, thickness]; shorter arrays keep the
 *              defaults (the switch fallthrough below is deliberate)
 *
 * NOTE(review): `state` is accepted for error reporting but never
 * written here, and rb_yield_splat() can raise without rb_protect —
 * confirm whether the caller's `if (state)` check is intended to work.
 */
void
yunet_process_features(vector<YuNet::Face>& faces,
                       Mat& img, VALUE v_features, int *state)
{
    for (int i = 0; i < faces.size(); i++) {
        Rect box = faces[i].first;
        YuNet::Landmark lmark = faces[i].second;

        VALUE v_type = ID2SYM(id_face);
        VALUE v_box = rb_ary_new_from_args(4,
                          INT2NUM(box.x    ), INT2NUM(box.y     ),
                          INT2NUM(box.width), INT2NUM(box.height));
        VALUE v_landmark = rb_ary_new_from_args(5,
                               rb_ary_new_from_args(2, INT2NUM(lmark[0].x),
                                                       INT2NUM(lmark[0].y)),
                               rb_ary_new_from_args(2, INT2NUM(lmark[1].x),
                                                       INT2NUM(lmark[1].y)),
                               rb_ary_new_from_args(2, INT2NUM(lmark[2].x),
                                                       INT2NUM(lmark[2].y)),
                               rb_ary_new_from_args(2, INT2NUM(lmark[3].x),
                                                       INT2NUM(lmark[3].y)),
                               rb_ary_new_from_args(2, INT2NUM(lmark[4].x),
                                                       INT2NUM(lmark[4].y)));
        VALUE v_feature = rb_ary_new_from_args(3, v_type, v_box,
                                               v_landmark);
        rb_ary_push(v_features, v_feature);

        // Only annotate when a drawable image exists (it is the empty
        // Mat when the caller requested no output format).
        if (!img.empty() && rb_block_given_p()) {
            VALUE v_annotation= rb_yield_splat(v_feature);
            VALUE v_label     = Qnil;
            VALUE v_color     = ULONG2NUM(0x0000ff);
            VALUE v_thickness = INT2NUM(1);

            switch (TYPE(v_annotation)) {
            case T_NIL:
                break;
            case T_HASH:    // reserved; currently ignored
                break;
            case T_ARRAY:
                // Deliberate fallthrough: longer arrays set more fields.
                switch(RARRAY_LENINT(v_annotation)) {
                default:
                case 3: v_thickness = RARRAY_AREF(v_annotation, 2);
                        /* fall through */
                case 2: v_color     = RARRAY_AREF(v_annotation, 1);
                        /* fall through */
                case 1: v_label     = RARRAY_AREF(v_annotation, 0);
                        /* fall through */
                case 0: break;
                }
                break;
            case T_STRING:
                v_label = v_annotation;
                break;
            }

            if (! NIL_P(v_label)) {
                string label     = StringValueCStr(v_label);
                long   rgb       = NUM2ULONG(v_color);
                int    thickness = NUM2INT(v_thickness);
                // 0xRRGGBB from Ruby, mapped to OpenCV's BGR order.
                Scalar color = cv::Scalar((rgb >>  0) & 0xFF,
                                          (rgb >>  8) & 0xFF,
                                          (rgb >> 16) & 0xFF);
                draw_labelbox(img, label, box, color, BLACK, thickness);

                // Mark each of the 5 landmark points.
                for (const auto& p : lmark) {
                    cv::circle(img, p, 3, cv::Scalar(255, 0, 0), 2);
                }
            }
        }
    }
}
229
+
230
+
231
+
232
+ void
233
+ yolo_process_features(vector<Yolo::Item>& items,
234
+ Mat& img, VALUE v_features, int *state)
235
+ {
236
+ for (int i = 0; i < items.size(); i++) {
237
+ string name = std::get<0>(items[i]);
238
+ float confidence = std::get<1>(items[i]);
239
+ Rect box = std::get<2>(items[i]);
240
+
241
+ VALUE v_type = ID2SYM(id_class);
242
+ VALUE v_name = rb_str_new(name.c_str(), name.size());
243
+ VALUE v_confidence = DBL2NUM(confidence);
244
+ VALUE v_box = rb_ary_new_from_args(4,
245
+ INT2NUM(box.x ), INT2NUM(box.y ),
246
+ INT2NUM(box.width), INT2NUM(box.height));
247
+ VALUE v_feature = rb_ary_new_from_args(4, v_type, v_box,
248
+ v_name, v_confidence);
249
+ rb_ary_push(v_features, v_feature);
250
+
251
+ if (rb_block_given_p()) {
252
+ VALUE v_annotation = rb_yield_splat(v_feature);
253
+
254
+ VALUE v_label = Qnil;
255
+ VALUE v_color = ULONG2NUM(0x0000ff);
256
+ VALUE v_thickness = INT2NUM(1);
257
+
258
+ switch (TYPE(v_annotation)) {
259
+ case T_NIL:
260
+ break;
261
+ case T_HASH:
262
+ break;
263
+ case T_ARRAY:
264
+ switch(RARRAY_LENINT(v_annotation)) {
265
+ default:
266
+ case 3: v_thickness = RARRAY_AREF(v_annotation, 2);
267
+ case 2: v_color = RARRAY_AREF(v_annotation, 1);
268
+ case 1: v_label = RARRAY_AREF(v_annotation, 0);
269
+ case 0: break;
270
+ }
271
+ break;
272
+ case T_STRING:
273
+ v_label = v_annotation;
274
+ break;
275
+ }
276
+
277
+ if (! NIL_P(v_label)) {
278
+ string label = StringValueCStr(v_label);
279
+ long rgb = NUM2ULONG(v_color);
280
+ int thickness = NUM2INT(v_thickness);
281
+ Scalar color = cv::Scalar((rgb >> 0) & 0xFF,
282
+ (rgb >> 8) & 0xFF,
283
+ (rgb >> 16) & 0xFF);
284
+ draw_labelbox(img, label, box, color, BLACK, thickness);
285
+ }
286
+ }
287
+ }
288
+ }
289
+
290
+
291
+
292
+ static VALUE
293
+ circe_m_analyze(int argc, VALUE* argv, VALUE self) {
294
+ // Retrieve arguments
295
+ VALUE v_imgstr, v_format, v_opts;
296
+ VALUE kwargs[3];
297
+ rb_scan_args(argc, argv, "11:", &v_imgstr, &v_format, &v_opts);
298
+ rb_get_kwargs(v_opts, (ID[]){ id_debug, id_face, id_classify },
299
+ 0, 3, kwargs);
300
+ VALUE v_debug = IF_UNDEF(kwargs[0], Qfalse);
301
+ VALUE v_face = IF_UNDEF(kwargs[1], Qfalse);
302
+ VALUE v_classify = IF_UNDEF(kwargs[2], Qfalse);
303
+
304
+ VALUE v_features = rb_ary_new();
305
+ VALUE v_image = Qnil;
306
+
307
+ if (! NIL_P(v_format)) {
308
+ Check_Type(v_format, T_SYMBOL);
309
+ ID i_format = rb_sym2id(v_format);
310
+ if ((i_format != id_png) && (i_format != id_jpg))
311
+ rb_raise(rb_eArgError, "format must be :png, :jpg or nil");
312
+ }
313
+
314
+ if (!RTEST(v_face) && !RTEST(v_classify)) {
315
+ v_face = v_classify = Qtrue;
316
+ }
317
+
318
+
319
+ // Load image.
320
+ Mat i_img = cv::imdecode(cv::Mat(1, RSTRING_LEN(v_imgstr), CV_8UC1,
321
+ (unsigned char *)RSTRING_PTR(v_imgstr)),
322
+ IMREAD_UNCHANGED);
323
+ Mat o_img = NIL_P(v_format) ? cv::Mat() : i_img.clone();
324
+
325
+ // Processing
326
+ std::chrono::time_point<std::chrono::system_clock> start_time, end_time;
327
+ std::chrono::duration<double> duration;
328
+ int state = 0;
329
+
330
+ start_time = std::chrono::system_clock::now();
331
+
332
+ if (RTEST(v_classify)) {
333
+ vector<Yolo::Item> items;
334
+ yolo->process(i_img, items);
335
+ yolo_process_features(items, o_img, v_features, &state);
336
+ if (state) goto exception;
337
+ }
338
+
339
+ if (RTEST(v_face)) {
340
+ vector<YuNet::Face> faces;
341
+ yunet->process(i_img, faces);
342
+ yunet_process_features(faces, o_img, v_features, &state);
343
+ if (state) goto exception;
344
+ }
345
+
346
+ end_time = std::chrono::system_clock::now();
347
+ duration = end_time - start_time;
348
+
349
+
350
+ if (! NIL_P(v_format)) {
351
+ if (RTEST(v_debug)) {
352
+ double ms = duration / 1.0ms;
353
+ string label = cv::format("Inference time : %0.2f ms", ms);
354
+ cv::putText(o_img, label, Point(20, 40),
355
+ FONT_FACE, FONT_SCALE, RED);
356
+ }
357
+
358
+ ID i_format = rb_sym2id(v_format);
359
+ string format;
360
+ if (i_format == id_png) { format = ".png"; }
361
+ else if (i_format == id_jpg) { format = ".jpg"; }
362
+
363
+ std::vector<uchar> buf;
364
+ cv::imencode(format, o_img, buf);
365
+ v_image = rb_str_new(reinterpret_cast<char*>(buf.data()), buf.size());
366
+ }
367
+ return rb_ary_new_from_args(2, v_features, v_image);
368
+
369
+ exception:
370
+ i_img.release();
371
+ o_img.release();
372
+ rb_jump_tag(state);
373
+ }
374
+
375
+
376
+
377
+
378
/*
 * Extension entry point. Defines Circe and Circe::Error, builds the
 * detector singletons from the ONNX model paths in the Circe::ONNX_YOLO
 * and Circe::ONNX_YUNET constants (which must be defined on the Ruby
 * side before this extension is required), and interns the symbols
 * used by circe_m_analyze's keyword parsing.
 */
extern "C"
void Init_core(void) {
    /* Main classes */
    cCirce      = rb_define_class("Circe", rb_cObject);
    eCirceError = rb_define_class_under(cCirce, "Error", rb_eStandardError);
    // myclass = rb_const_get(mymodule, sym_myclass);

    VALUE v_onnx_yolo  = rb_const_get(cCirce, rb_intern("ONNX_YOLO"));
    VALUE v_onnx_yunet = rb_const_get(cCirce, rb_intern("ONNX_YUNET"));

    /* Function-local statics: each detector is constructed exactly once
     * here and then shared through the module-level pointers. */
    static Yolo  _yolo  = { StringValueCStr(v_onnx_yolo ) };
    static YuNet _yunet = { StringValueCStr(v_onnx_yunet) };

    yolo  = &_yolo;
    yunet = &_yunet;


    /* Symbols used for keyword/format parsing in circe_m_analyze. */
    id_debug    = rb_intern_const("debug"   );
    id_face     = rb_intern_const("face"    );
    id_classify = rb_intern_const("classify");
    id_class    = rb_intern_const("class"   );
    id_png      = rb_intern_const("png"     );
    id_jpg      = rb_intern_const("jpg"     );


    rb_define_method(cCirce, "analyze", circe_m_analyze, -1);
}
data/ext/extconf.rb ADDED
@@ -0,0 +1,15 @@
1
require 'mkmf'

# FreeBSD installs pkgconf outside mkmf's default search path.
$PKGCONFIG =
  case RbConfig::CONFIG['host_os']
  when /bsd/ then '/usr/local/bin/pkgconf'
  end


cflags, ldflags, libs = pkg_config('opencv4')

# pkg_config returns nil values when the package is missing; fail with
# a clear message instead of producing a Makefile that cannot compile.
abort 'opencv4 not found (pkg-config / pkgconf)' if cflags.nil?

$LDFLAGS  += " #{ldflags} #{libs}"
$INCFLAGS += " #{cflags}"
# Fix: the flag previously lacked a leading space and was appended
# directly onto the last existing flag (e.g. "...-O2-std=c++17").
$CXXFLAGS += " -std=c++17"

create_makefile("circe/core")
data/ext/yolo.cpp ADDED
@@ -0,0 +1,94 @@
1
+ #include <tuple>
2
+ #include <string>
3
+ #include <opencv2/imgproc.hpp>
4
+ #include <opencv2/dnn.hpp>
5
+
6
+ #include "yolo.h"
7
+
8
+
9
+ Yolo::Yolo(const std::string& model) {
10
+ net = cv::dnn::readNetFromONNX(model);
11
+ }
12
+
13
+
14
+ void Yolo::process(cv::Mat &img, std::vector<Yolo::Item> &items) {
15
+
16
+ // Pre-process
17
+ cv::Mat blob;
18
+ std::vector<cv::Mat> outputs;
19
+
20
+ cv::dnn::blobFromImage(img, blob, 1./255.,
21
+ cv::Size(INPUT_WIDTH, INPUT_HEIGHT),
22
+ cv::Scalar(), true, false);
23
+
24
+ // Process
25
+ net.setInput(blob);
26
+ net.forward(outputs, net.getUnconnectedOutLayersNames());
27
+
28
+ // Post-process
29
+ std::vector<int> class_ids;
30
+ std::vector<float> confidences;
31
+ std::vector<cv::Rect> boxes;
32
+
33
+ // Resizing factor.
34
+ float x_factor = img.cols / INPUT_WIDTH;
35
+ float y_factor = img.rows / INPUT_HEIGHT;
36
+
37
+ float *data = (float *)outputs[0].data;
38
+ const int dimensions = 85;
39
+
40
+ // 25200 for default size 640.
41
+ const int rows = 25200;
42
+ // Iterate through 25200 detections.
43
+ for (int i = 0; i < rows; ++i) {
44
+ float confidence = data[4];
45
+
46
+ // Discard bad detections and continue.
47
+ if (confidence >= CONFIDENCE_THRESHOLD) {
48
+ float *classes_scores = data + 5;
49
+
50
+ // Create a 1x85 Mat and store class scores of 80 classes.
51
+ cv::Mat scores(1, names.size(), CV_32FC1, classes_scores);
52
+
53
+ // Perform minMaxLoc and acquire the index of best class score.
54
+ cv::Point class_id;
55
+ double max_class_score;
56
+ minMaxLoc(scores, 0, &max_class_score, 0, &class_id);
57
+
58
+ // Continue if the class score is above the threshold.
59
+ if (max_class_score > SCORE_THRESHOLD) {
60
+ // Store class ID and confidence in the pre-defined
61
+ // respective vectors.
62
+ float cx = data[0]; // Center
63
+ float cy = data[1];
64
+ float w = data[2]; // Box dimension
65
+ float h = data[3];
66
+
67
+ // Bounding box coordinates.
68
+ int left = int((cx - 0.5 * w) * x_factor);
69
+ int top = int((cy - 0.5 * h) * y_factor);
70
+ int width = int(w * x_factor);
71
+ int height = int(h * y_factor);
72
+
73
+ // Store good detections in the boxes vector.
74
+ confidences.push_back(confidence);
75
+ class_ids.push_back(class_id.x);
76
+ boxes.push_back(cv::Rect(left, top, width, height));
77
+ }
78
+ }
79
+ // Jump to the next row.
80
+ data += 85;
81
+ }
82
+
83
+ // Perform Non-Maximum Suppression and draw predictions.
84
+ std::vector<int> indices;
85
+ cv::dnn::NMSBoxes(boxes, confidences,
86
+ SCORE_THRESHOLD, NMS_THRESHOLD, indices);
87
+
88
+ items.clear();
89
+
90
+ for (int i = 0; i < indices.size(); i++) {
91
+ int idx = indices[i];
92
+ items.push_back({ names[class_ids[idx]],confidences[idx],boxes[idx] });
93
+ }
94
+ }
data/ext/yolo.h ADDED
@@ -0,0 +1,48 @@
1
+ #ifndef __YOLO__
2
+ #define __YOLO__
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ #include <tuple>
7
+
8
+ #include <opencv2/dnn.hpp>
9
+
10
+
11
+ class Yolo
12
+ {
13
+ public:
14
+ typedef std::tuple<std::string, float, cv::Rect> Item;
15
+
16
+ private:
17
+ static constexpr float INPUT_WIDTH = 640.0;
18
+ static constexpr float INPUT_HEIGHT = 640.0;
19
+ static constexpr float SCORE_THRESHOLD = 0.5;
20
+ static constexpr float NMS_THRESHOLD = 0.45;
21
+ static constexpr float CONFIDENCE_THRESHOLD = 0.45;
22
+
23
+ public:
24
+ const std::vector<std::string> names = {
25
+ "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train",
26
+ "truck", "boat", "traffic light", "fire hydrant", "stop sign",
27
+ "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
28
+ "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
29
+ "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
30
+ "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
31
+ "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
32
+ "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
33
+ "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
34
+ "sofa", "potted plant", "bed", "dining table", "toilet", "tvmonitor",
35
+ "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
36
+ "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
37
+ "scissors", "teddy bear", "hair drier", "toothbrush"
38
+ };
39
+
40
+ public:
41
+ Yolo(const std::string& model);
42
+ void process(cv::Mat &img, std::vector<Item> &items);
43
+
44
+ private:
45
+ cv::dnn::Net net;
46
+ };
47
+
48
+ #endif
data/ext/yunet.cpp ADDED
@@ -0,0 +1,132 @@
1
+ #include <cmath>
2
+ #include <string>
3
+ #include <vector>
4
+ #include <numeric>
5
+ #include <algorithm>
6
+
7
+ #include <opencv2/dnn.hpp>
8
+
9
+ #include "yunet.h"
10
+
11
+
12
+ YuNet::YuNet(const std::string& model_filename)
13
+ {
14
+ net = cv::dnn::readNetFromONNX(model_filename);
15
+ net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
16
+ net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
17
+ }
18
+
19
+
20
+ void YuNet::process(const cv::Mat& img, std::vector<YuNet::Face>& faces)
21
+ {
22
+ /* -- Preparing for image size -- */
23
+ cv::Size model_size;
24
+ model_size.width = MODEL_WIDTH;
25
+ model_size.height = MODEL_WIDTH * img.rows / img.cols;
26
+ model_size.height = (model_size.height / 32) * 32;
27
+
28
+ std::pair<int32_t, int32_t> feature_map_2th = {
29
+ (model_size.height + 1) / 2 / 2,
30
+ (model_size.width + 1) / 2 / 2
31
+ };
32
+
33
+ std::vector<std::pair<int32_t, int32_t>> feature_map_list;
34
+ feature_map_list.push_back({ (feature_map_2th.first + 1) / 2 ,
35
+ (feature_map_2th.second + 1) / 2 });
36
+
37
+ for (int32_t i = 0; i < 3; i++) {
38
+ const auto& previous = feature_map_list.back();
39
+ feature_map_list.push_back({ (previous.first + 1) / 2 ,
40
+ (previous.second + 1) / 2 });
41
+ }
42
+
43
+ std::vector<std::vector<float>> prior_list;
44
+ for (int i = 0; i < static_cast<int32_t>(feature_map_list.size()); i++) {
45
+ const auto& min_sizes = MIN_SIZES[i];
46
+ const auto& feature_map = feature_map_list[i];
47
+ for (int y = 0; y < feature_map.first; y++) {
48
+ for (int x = 0; x < feature_map.second; x++) {
49
+ for (const auto& min_size : min_sizes) {
50
+ float s_kx = static_cast<float>(min_size) / model_size.width;
51
+ float s_ky = static_cast<float>(min_size) / model_size.height;
52
+ float cx = (x + 0.5f) * STEPS[i] / model_size.width;
53
+ float cy = (y + 0.5f) * STEPS[i] / model_size.height;
54
+ prior_list.push_back({ cx, cy, s_kx, s_ky });
55
+ }
56
+ }
57
+ }
58
+ }
59
+
60
+
61
+ /* -- Pre-process -- */
62
+ cv::Mat blob;
63
+ cv::dnn::blobFromImage(img, blob, 1.0, model_size);
64
+
65
+ /* -- Inference -- */
66
+ std::vector<cv::Mat> outputs;
67
+ net.setInput(blob);
68
+ net.forward(outputs, { "conf", "iou", "loc" });
69
+
70
+ /* -- Post Process -- */
71
+ const cv::Mat& mat_conf = outputs[0];
72
+ const cv::Mat& mat_iou = outputs[1];
73
+ const cv::Mat& mat_loc = outputs[2];
74
+ const cv::Size image_size = img.size();
75
+
76
+ // Get score list
77
+ std::vector<float> cls_score;
78
+ for (int32_t row = 0; row < mat_conf.rows; row++) {
79
+ float val = mat_conf.at<float>(cv::Point(1, row));
80
+ cls_score.push_back(std::clamp(val, 0.0f, 1.0f));
81
+ }
82
+
83
+ std::vector<float> iou_score;
84
+ for (int32_t row = 0; row < mat_iou.rows; row++) {
85
+ float val = mat_conf.at<float>(cv::Point(0, row));
86
+ iou_score.push_back(std::clamp(val, 0.0f, 1.0f));
87
+ }
88
+
89
+ std::vector<float> score;
90
+ for (int32_t row = 0; row < mat_conf.rows; row++) {
91
+ score.push_back(std::sqrt(cls_score[row] * iou_score[row]));
92
+ }
93
+
94
+ // All bbox
95
+ std::vector<cv::Rect> bbox_all;
96
+ for (int row = 0; row < mat_loc.rows; row++) {
97
+ float cx = mat_loc.at<float>(cv::Point(0, row));
98
+ float cy = mat_loc.at<float>(cv::Point(1, row));
99
+ float w = mat_loc.at<float>(cv::Point(2, row));
100
+ float h = mat_loc.at<float>(cv::Point(3, row));
101
+
102
+ cx = prior_list[row][0] + cx * VARIANCES[0] * prior_list[row][2];
103
+ cy = prior_list[row][1] + cy * VARIANCES[0] * prior_list[row][3];
104
+ w = prior_list[row][2] * std::exp(w * VARIANCES[0]);
105
+ h = prior_list[row][3] * std::exp(h * VARIANCES[1]);
106
+
107
+ bbox_all.push_back({
108
+ static_cast<int32_t>((cx - w / 2) * image_size.width),
109
+ static_cast<int32_t>((cy - h / 2) * image_size.height),
110
+ static_cast<int32_t>(w * image_size.width),
111
+ static_cast<int32_t>(h * image_size.height) });
112
+ }
113
+
114
+ // Non-Maximum Suppression
115
+ std::vector<int> indices;
116
+ cv::dnn::NMSBoxes(bbox_all, score, CONF_THRESHOLD, NMS_THRESHOLD, indices);
117
+
118
+ // Get valid bbox and landmark
119
+ faces.clear();
120
+ for (int idx : indices) {
121
+ Landmark landmark; // (landmark is 5 points)
122
+ for (int i = 0; i < static_cast<int>(landmark.size()); i++) {
123
+ cv::Point& p = landmark[i];
124
+ float x = mat_loc.at<float>(cv::Point(4 + i * 2, idx));
125
+ float y = mat_loc.at<float>(cv::Point(4 + i * 2 + 1, idx));
126
+ p.x = static_cast<int32_t>((prior_list[idx][0] + x * VARIANCES[0] * prior_list[idx][2]) * image_size.width);
127
+ p.y = static_cast<int32_t>((prior_list[idx][1] + y * VARIANCES[0] * prior_list[idx][3]) * image_size.height);
128
+ }
129
+
130
+ faces.push_back({ bbox_all[idx], landmark });
131
+ }
132
+ }