imagecore 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. data/.gitignore +24 -0
  2. data/Gemfile +4 -0
  3. data/Rakefile +2 -0
  4. data/ext/imagecore/analyze_image.cxx +58 -0
  5. data/ext/imagecore/analyze_image.h +6 -0
  6. data/ext/imagecore/extconf.rb +9 -0
  7. data/ext/imagecore/imagecore.cxx +34 -0
  8. data/ext/opencv/core/___.c +3 -0
  9. data/ext/opencv/core/alloc.cpp +697 -0
  10. data/ext/opencv/core/array.cpp +3206 -0
  11. data/ext/opencv/core/datastructs.cpp +4064 -0
  12. data/ext/opencv/core/extconf.rb +22 -0
  13. data/ext/opencv/core/matrix.cpp +3777 -0
  14. data/ext/opencv/core/precomp.hpp +216 -0
  15. data/ext/opencv/core/system.cpp +832 -0
  16. data/ext/opencv/core/tables.cpp +3512 -0
  17. data/ext/opencv/highgui/___.c +3 -0
  18. data/ext/opencv/highgui/bitstrm.cpp +582 -0
  19. data/ext/opencv/highgui/bitstrm.hpp +182 -0
  20. data/ext/opencv/highgui/extconf.rb +28 -0
  21. data/ext/opencv/highgui/grfmt_base.cpp +128 -0
  22. data/ext/opencv/highgui/grfmt_base.hpp +113 -0
  23. data/ext/opencv/highgui/grfmt_bmp.cpp +564 -0
  24. data/ext/opencv/highgui/grfmt_bmp.hpp +99 -0
  25. data/ext/opencv/highgui/grfmt_exr.hpp +113 -0
  26. data/ext/opencv/highgui/grfmt_imageio.hpp +56 -0
  27. data/ext/opencv/highgui/grfmt_jpeg.cpp +622 -0
  28. data/ext/opencv/highgui/grfmt_jpeg.hpp +90 -0
  29. data/ext/opencv/highgui/grfmt_jpeg2000.cpp +529 -0
  30. data/ext/opencv/highgui/grfmt_jpeg2000.hpp +95 -0
  31. data/ext/opencv/highgui/grfmt_png.cpp +406 -0
  32. data/ext/opencv/highgui/grfmt_png.hpp +101 -0
  33. data/ext/opencv/highgui/grfmt_pxm.cpp +513 -0
  34. data/ext/opencv/highgui/grfmt_pxm.hpp +92 -0
  35. data/ext/opencv/highgui/grfmt_sunras.cpp +425 -0
  36. data/ext/opencv/highgui/grfmt_sunras.hpp +105 -0
  37. data/ext/opencv/highgui/grfmt_tiff.cpp +718 -0
  38. data/ext/opencv/highgui/grfmt_tiff.hpp +136 -0
  39. data/ext/opencv/highgui/grfmts.hpp +56 -0
  40. data/ext/opencv/highgui/loadsave.cpp +535 -0
  41. data/ext/opencv/highgui/precomp.hpp +223 -0
  42. data/ext/opencv/highgui/utils.cpp +689 -0
  43. data/ext/opencv/highgui/utils.hpp +128 -0
  44. data/ext/opencv/imgproc/___.c +3 -0
  45. data/ext/opencv/imgproc/_geom.h +72 -0
  46. data/ext/opencv/imgproc/color.cpp +3179 -0
  47. data/ext/opencv/imgproc/contours.cpp +1780 -0
  48. data/ext/opencv/imgproc/extconf.rb +11 -0
  49. data/ext/opencv/imgproc/filter.cpp +3063 -0
  50. data/ext/opencv/imgproc/precomp.hpp +159 -0
  51. data/ext/opencv/imgproc/shapedescr.cpp +1306 -0
  52. data/ext/opencv/imgproc/smooth.cpp +1566 -0
  53. data/ext/opencv/imgproc/tables.cpp +214 -0
  54. data/ext/opencv/imgproc/thresh.cpp +636 -0
  55. data/ext/opencv/imgproc/utils.cpp +242 -0
  56. data/ext/opencv/include/opencv2/core/core.hpp +4344 -0
  57. data/ext/opencv/include/opencv2/core/core_c.h +1885 -0
  58. data/ext/opencv/include/opencv2/core/internal.hpp +710 -0
  59. data/ext/opencv/include/opencv2/core/mat.hpp +2557 -0
  60. data/ext/opencv/include/opencv2/core/operations.hpp +3623 -0
  61. data/ext/opencv/include/opencv2/core/types_c.h +1875 -0
  62. data/ext/opencv/include/opencv2/core/version.hpp +58 -0
  63. data/ext/opencv/include/opencv2/highgui/highgui.hpp +198 -0
  64. data/ext/opencv/include/opencv2/highgui/highgui_c.h +506 -0
  65. data/ext/opencv/include/opencv2/imgproc/imgproc.hpp +1139 -0
  66. data/ext/opencv/include/opencv2/imgproc/imgproc_c.h +783 -0
  67. data/ext/opencv/include/opencv2/imgproc/types_c.h +538 -0
  68. data/imagecore.gemspec +20 -0
  69. data/lib/imagecore.rb +16 -0
  70. data/lib/imagecore/version.rb +3 -0
  71. metadata +119 -0
data/ext/opencv/imgproc/extconf.rb
@@ -0,0 +1,11 @@
+ require 'mkmf'
+
+ # not valid for C++ code
+ $warnflags = ($warnflags.to_s.split - %w(-Wdeclaration-after-statement -Wimplicit-function-declaration)) * ' '
+
+ # OpenCV includes
+ $INCFLAGS << ' -I ../include'
+
+ create_header('cvconfig.h')
+
+ create_makefile("opencv_imgproc")
data/ext/opencv/imgproc/filter.cpp
@@ -0,0 +1,3063 @@
1
+ /*M///////////////////////////////////////////////////////////////////////////////////////
2
+ //
3
+ // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
+ //
5
+ // By downloading, copying, installing or using the software you agree to this license.
6
+ // If you do not agree to this license, do not download, install,
7
+ // copy or use the software.
8
+ //
9
+ //
10
+ // License Agreement
11
+ // For Open Source Computer Vision Library
12
+ //
13
+ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
+ // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
+ // Third party copyrights are property of their respective owners.
16
+ //
17
+ // Redistribution and use in source and binary forms, with or without modification,
18
+ // are permitted provided that the following conditions are met:
19
+ //
20
+ // * Redistribution's of source code must retain the above copyright notice,
21
+ // this list of conditions and the following disclaimer.
22
+ //
23
+ // * Redistribution's in binary form must reproduce the above copyright notice,
24
+ // this list of conditions and the following disclaimer in the documentation
25
+ // and/or other materials provided with the distribution.
26
+ //
27
+ // * The name of the copyright holders may not be used to endorse or promote products
28
+ // derived from this software without specific prior written permission.
29
+ //
30
+ // This software is provided by the copyright holders and contributors "as is" and
31
+ // any express or implied warranties, including, but not limited to, the implied
32
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
33
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
34
+ // indirect, incidental, special, exemplary, or consequential damages
35
+ // (including, but not limited to, procurement of substitute goods or services;
36
+ // loss of use, data, or profits; or business interruption) however caused
37
+ // and on any theory of liability, whether in contract, strict liability,
38
+ // or tort (including negligence or otherwise) arising in any way out of
39
+ // the use of this software, even if advised of the possibility of such damage.
40
+ //
41
+ //M*/
42
+
43
+ #include "precomp.hpp"
44
+
45
+ /****************************************************************************************\
46
+ Base Image Filter
47
+ \****************************************************************************************/
48
+
49
+ /*
50
+ Various border types, image boundaries are denoted with '|'
51
+
52
+ * BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
53
+ * BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
54
+ * BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
55
+ * BORDER_WRAP: cdefgh|abcdefgh|abcdefg
56
+ * BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii with some specified 'i'
57
+ */
58
+ int cv::borderInterpolate( int p, int len, int borderType )
59
+ {
60
+ if( (unsigned)p < (unsigned)len )
61
+ ;
62
+ else if( borderType == BORDER_REPLICATE )
63
+ p = p < 0 ? 0 : len - 1;
64
+ else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
65
+ {
66
+ int delta = borderType == BORDER_REFLECT_101;
67
+ if( len == 1 )
68
+ return 0;
69
+ do
70
+ {
71
+ if( p < 0 )
72
+ p = -p - 1 + delta;
73
+ else
74
+ p = len - 1 - (p - len) - delta;
75
+ }
76
+ while( (unsigned)p >= (unsigned)len );
77
+ }
78
+ else if( borderType == BORDER_WRAP )
79
+ {
80
+ if( p < 0 )
81
+ p -= ((p-len+1)/len)*len;
82
+ if( p >= len )
83
+ p %= len;
84
+ }
85
+ else if( borderType == BORDER_CONSTANT )
86
+ p = -1;
87
+ else
88
+ CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
89
+ return p;
90
+ }
91
+
92
+
93
+ namespace cv
94
+ {
95
+
96
+ BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
97
+ BaseRowFilter::~BaseRowFilter() {}
98
+
99
+ BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
100
+ BaseColumnFilter::~BaseColumnFilter() {}
101
+ void BaseColumnFilter::reset() {}
102
+
103
+ BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
104
+ BaseFilter::~BaseFilter() {}
105
+ void BaseFilter::reset() {}
106
+
107
+ FilterEngine::FilterEngine()
108
+ {
109
+ srcType = dstType = bufType = -1;
110
+ rowBorderType = columnBorderType = BORDER_REPLICATE;
111
+ bufStep = startY = startY0 = endY = rowCount = dstY = 0;
112
+ maxWidth = 0;
113
+
114
+ wholeSize = Size(-1,-1);
115
+ }
116
+
117
+
118
+ FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
119
+ const Ptr<BaseRowFilter>& _rowFilter,
120
+ const Ptr<BaseColumnFilter>& _columnFilter,
121
+ int _srcType, int _dstType, int _bufType,
122
+ int _rowBorderType, int _columnBorderType,
123
+ const Scalar& _borderValue )
124
+ {
125
+ init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
126
+ _rowBorderType, _columnBorderType, _borderValue);
127
+ }
128
+
129
+ FilterEngine::~FilterEngine()
130
+ {
131
+ }
132
+
133
+
134
+ void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
135
+ const Ptr<BaseRowFilter>& _rowFilter,
136
+ const Ptr<BaseColumnFilter>& _columnFilter,
137
+ int _srcType, int _dstType, int _bufType,
138
+ int _rowBorderType, int _columnBorderType,
139
+ const Scalar& _borderValue )
140
+ {
141
+ _srcType = CV_MAT_TYPE(_srcType);
142
+ _bufType = CV_MAT_TYPE(_bufType);
143
+ _dstType = CV_MAT_TYPE(_dstType);
144
+
145
+ srcType = _srcType;
146
+ int srcElemSize = (int)getElemSize(srcType);
147
+ dstType = _dstType;
148
+ bufType = _bufType;
149
+
150
+ filter2D = _filter2D;
151
+ rowFilter = _rowFilter;
152
+ columnFilter = _columnFilter;
153
+
154
+ if( _columnBorderType < 0 )
155
+ _columnBorderType = _rowBorderType;
156
+
157
+ rowBorderType = _rowBorderType;
158
+ columnBorderType = _columnBorderType;
159
+
160
+ CV_Assert( columnBorderType != BORDER_WRAP );
161
+
162
+ if( isSeparable() )
163
+ {
164
+ CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
165
+ ksize = Size(rowFilter->ksize, columnFilter->ksize);
166
+ anchor = Point(rowFilter->anchor, columnFilter->anchor);
167
+ }
168
+ else
169
+ {
170
+ CV_Assert( bufType == srcType );
171
+ ksize = filter2D->ksize;
172
+ anchor = filter2D->anchor;
173
+ }
174
+
175
+ CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
176
+ 0 <= anchor.y && anchor.y < ksize.height );
177
+
178
+ borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
179
+ int borderLength = std::max(ksize.width - 1, 1);
180
+ borderTab.resize(borderLength*borderElemSize);
181
+
182
+ maxWidth = bufStep = 0;
183
+ constBorderRow.clear();
184
+
185
+ if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
186
+ {
187
+ constBorderValue.resize(srcElemSize*borderLength);
188
+ scalarToRawData(_borderValue, &constBorderValue[0], srcType,
189
+ borderLength*CV_MAT_CN(srcType));
190
+ }
191
+
192
+ wholeSize = Size(-1,-1);
193
+ }
194
+
195
+ static const int VEC_ALIGN = CV_MALLOC_ALIGN;
196
+
197
+ int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
198
+ {
199
+ int i, j;
200
+
201
+ wholeSize = _wholeSize;
202
+ roi = _roi;
203
+ CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
204
+ roi.x + roi.width <= wholeSize.width &&
205
+ roi.y + roi.height <= wholeSize.height );
206
+
207
+ int esz = (int)getElemSize(srcType);
208
+ int bufElemSize = (int)getElemSize(bufType);
209
+ const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;
210
+
211
+ if( _maxBufRows < 0 )
212
+ _maxBufRows = ksize.height + 3;
213
+ _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);
214
+
215
+ if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
216
+ {
217
+ rows.resize(_maxBufRows);
218
+ maxWidth = std::max(maxWidth, roi.width);
219
+ int cn = CV_MAT_CN(srcType);
220
+ srcRow.resize(esz*(maxWidth + ksize.width - 1));
221
+ if( columnBorderType == BORDER_CONSTANT )
222
+ {
223
+ constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
224
+ uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
225
+ int n = (int)constBorderValue.size(), N;
226
+ N = (maxWidth + ksize.width - 1)*esz;
227
+ tdst = isSeparable() ? &srcRow[0] : dst;
228
+
229
+ for( i = 0; i < N; i += n )
230
+ {
231
+ n = std::min( n, N - i );
232
+ for(j = 0; j < n; j++)
233
+ tdst[i+j] = constVal[j];
234
+ }
235
+
236
+ if( isSeparable() )
237
+ (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
238
+ }
239
+
240
+ int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
241
+ (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
242
+ ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
243
+ }
244
+
245
+ // adjust bufstep so that the used part of the ring buffer stays compact in memory
246
+ bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);
247
+
248
+ dx1 = std::max(anchor.x - roi.x, 0);
249
+ dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);
250
+
251
+ // recompute border tables
252
+ if( dx1 > 0 || dx2 > 0 )
253
+ {
254
+ if( rowBorderType == BORDER_CONSTANT )
255
+ {
256
+ int nr = isSeparable() ? 1 : (int)rows.size();
257
+ for( i = 0; i < nr; i++ )
258
+ {
259
+ uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
260
+ memcpy( dst, constVal, dx1*esz );
261
+ memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
262
+ }
263
+ }
264
+ else
265
+ {
266
+ int xofs1 = std::min(roi.x, anchor.x) - roi.x;
267
+
268
+ int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
269
+ int* btab = (int*)&borderTab[0];
270
+
271
+ for( i = 0; i < dx1; i++ )
272
+ {
273
+ int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
274
+ for( j = 0; j < btab_esz; j++ )
275
+ btab[i*btab_esz + j] = p0 + j;
276
+ }
277
+
278
+ for( i = 0; i < dx2; i++ )
279
+ {
280
+ int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
281
+ for( j = 0; j < btab_esz; j++ )
282
+ btab[(i + dx1)*btab_esz + j] = p0 + j;
283
+ }
284
+ }
285
+ }
286
+
287
+ rowCount = dstY = 0;
288
+ startY = startY0 = std::max(roi.y - anchor.y, 0);
289
+ endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
290
+ if( !columnFilter.empty() )
291
+ columnFilter->reset();
292
+ if( !filter2D.empty() )
293
+ filter2D->reset();
294
+
295
+ return startY;
296
+ }
297
+
298
+
299
+ int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
300
+ bool isolated, int maxBufRows)
301
+ {
302
+ Rect srcRoi = _srcRoi;
303
+
304
+ if( srcRoi == Rect(0,0,-1,-1) )
305
+ srcRoi = Rect(0,0,src.cols,src.rows);
306
+
307
+ CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
308
+ srcRoi.width >= 0 && srcRoi.height >= 0 &&
309
+ srcRoi.x + srcRoi.width <= src.cols &&
310
+ srcRoi.y + srcRoi.height <= src.rows );
311
+
312
+ Point ofs;
313
+ Size wholeSize(src.cols, src.rows);
314
+ if( !isolated )
315
+ src.locateROI( wholeSize, ofs );
316
+ start( wholeSize, srcRoi + ofs, maxBufRows );
317
+
318
+ return startY - ofs.y;
319
+ }
320
+
321
+
322
+ int FilterEngine::remainingInputRows() const
323
+ {
324
+ return endY - startY - rowCount;
325
+ }
326
+
327
+ int FilterEngine::remainingOutputRows() const
328
+ {
329
+ return roi.height - dstY;
330
+ }
331
+
332
+ int FilterEngine::proceed( const uchar* src, int srcstep, int count,
333
+ uchar* dst, int dststep )
334
+ {
335
+ CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );
336
+
337
+ const int *btab = &borderTab[0];
338
+ int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
339
+ uchar** brows = &rows[0];
340
+ int bufRows = (int)rows.size();
341
+ int cn = CV_MAT_CN(bufType);
342
+ int width = roi.width, kwidth = ksize.width;
343
+ int kheight = ksize.height, ay = anchor.y;
344
+ int _dx1 = dx1, _dx2 = dx2;
345
+ int width1 = roi.width + kwidth - 1;
346
+ int xofs1 = std::min(roi.x, anchor.x);
347
+ bool isSep = isSeparable();
348
+ bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
349
+ int dy = 0, i = 0;
350
+
351
+ src -= xofs1*esz;
352
+ count = std::min(count, remainingInputRows());
353
+
354
+ CV_Assert( src && dst && count > 0 );
355
+
356
+ for(;; dst += dststep*i, dy += i)
357
+ {
358
+ int dcount = bufRows - ay - startY - rowCount + roi.y;
359
+ dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
360
+ dcount = std::min(dcount, count);
361
+ count -= dcount;
362
+ for( ; dcount-- > 0; src += srcstep )
363
+ {
364
+ int bi = (startY - startY0 + rowCount) % bufRows;
365
+ uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
366
+ uchar* row = isSep ? &srcRow[0] : brow;
367
+
368
+ if( ++rowCount > bufRows )
369
+ {
370
+ --rowCount;
371
+ ++startY;
372
+ }
373
+
374
+ memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );
375
+
376
+ if( makeBorder )
377
+ {
378
+ if( btab_esz*(int)sizeof(int) == esz )
379
+ {
380
+ const int* isrc = (const int*)src;
381
+ int* irow = (int*)row;
382
+
383
+ for( i = 0; i < _dx1*btab_esz; i++ )
384
+ irow[i] = isrc[btab[i]];
385
+ for( i = 0; i < _dx2*btab_esz; i++ )
386
+ irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
387
+ }
388
+ else
389
+ {
390
+ for( i = 0; i < _dx1*esz; i++ )
391
+ row[i] = src[btab[i]];
392
+ for( i = 0; i < _dx2*esz; i++ )
393
+ row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
394
+ }
395
+ }
396
+
397
+ if( isSep )
398
+ (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
399
+ }
400
+
401
+ int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
402
+ for( i = 0; i < max_i; i++ )
403
+ {
404
+ int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
405
+ wholeSize.height, columnBorderType);
406
+ if( srcY < 0 ) // can happen only with constant border type
407
+ brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
408
+ else
409
+ {
410
+ CV_Assert( srcY >= startY );
411
+ if( srcY >= startY + rowCount )
412
+ break;
413
+ int bi = (srcY - startY0) % bufRows;
414
+ brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
415
+ }
416
+ }
417
+ if( i < kheight )
418
+ break;
419
+ i -= kheight - 1;
420
+ if( isSeparable() )
421
+ (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
422
+ else
423
+ (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
424
+ }
425
+
426
+ dstY += dy;
427
+ CV_Assert( dstY <= roi.height );
428
+ return dy;
429
+ }
430
+
431
+
432
+ void FilterEngine::apply(const Mat& src, Mat& dst,
433
+ const Rect& _srcRoi, Point dstOfs, bool isolated)
434
+ {
435
+ CV_Assert( src.type() == srcType && dst.type() == dstType );
436
+
437
+ Rect srcRoi = _srcRoi;
438
+ if( srcRoi == Rect(0,0,-1,-1) )
439
+ srcRoi = Rect(0,0,src.cols,src.rows);
440
+
441
+ if( srcRoi.area() == 0 )
442
+ return;
443
+
444
+ CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
445
+ dstOfs.x + srcRoi.width <= dst.cols &&
446
+ dstOfs.y + srcRoi.height <= dst.rows );
447
+
448
+ int y = start(src, srcRoi, isolated);
449
+ proceed( src.data + y*src.step, (int)src.step, endY - startY,
450
+ dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
451
+ }
452
+
453
+ }
454
+
455
+ /****************************************************************************************\
456
+ * Separable linear filter *
457
+ \****************************************************************************************/
458
+
459
+ int cv::getKernelType(InputArray filter_kernel, Point anchor)
460
+ {
461
+ Mat _kernel = filter_kernel.getMat();
462
+ CV_Assert( _kernel.channels() == 1 );
463
+ int i, sz = _kernel.rows*_kernel.cols;
464
+
465
+ Mat kernel;
466
+ _kernel.convertTo(kernel, CV_64F);
467
+
468
+ const double* coeffs = (double*)kernel.data;
469
+ double sum = 0;
470
+ int type = KERNEL_SMOOTH + KERNEL_INTEGER;
471
+ if( (_kernel.rows == 1 || _kernel.cols == 1) &&
472
+ anchor.x*2 + 1 == _kernel.cols &&
473
+ anchor.y*2 + 1 == _kernel.rows )
474
+ type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);
475
+
476
+ for( i = 0; i < sz; i++ )
477
+ {
478
+ double a = coeffs[i], b = coeffs[sz - i - 1];
479
+ if( a != b )
480
+ type &= ~KERNEL_SYMMETRICAL;
481
+ if( a != -b )
482
+ type &= ~KERNEL_ASYMMETRICAL;
483
+ if( a < 0 )
484
+ type &= ~KERNEL_SMOOTH;
485
+ if( a != saturate_cast<int>(a) )
486
+ type &= ~KERNEL_INTEGER;
487
+ sum += a;
488
+ }
489
+
490
+ if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
491
+ type &= ~KERNEL_SMOOTH;
492
+ return type;
493
+ }
494
+
495
+
496
+ namespace cv
497
+ {
498
+
499
+ struct RowNoVec
500
+ {
501
+ RowNoVec() {}
502
+ RowNoVec(const Mat&) {}
503
+ int operator()(const uchar*, uchar*, int, int) const { return 0; }
504
+ };
505
+
506
+ struct ColumnNoVec
507
+ {
508
+ ColumnNoVec() {}
509
+ ColumnNoVec(const Mat&, int, int, double) {}
510
+ int operator()(const uchar**, uchar*, int) const { return 0; }
511
+ };
512
+
513
+ struct SymmRowSmallNoVec
514
+ {
515
+ SymmRowSmallNoVec() {}
516
+ SymmRowSmallNoVec(const Mat&, int) {}
517
+ int operator()(const uchar*, uchar*, int, int) const { return 0; }
518
+ };
519
+
520
+ struct SymmColumnSmallNoVec
521
+ {
522
+ SymmColumnSmallNoVec() {}
523
+ SymmColumnSmallNoVec(const Mat&, int, int, double) {}
524
+ int operator()(const uchar**, uchar*, int) const { return 0; }
525
+ };
526
+
527
+ struct FilterNoVec
528
+ {
529
+ FilterNoVec() {}
530
+ FilterNoVec(const Mat&, int, double) {}
531
+ int operator()(const uchar**, uchar*, int) const { return 0; }
532
+ };
533
+
534
+
535
+ #if CV_SSE2
536
+
537
+ ///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
538
+
539
+ struct RowVec_8u32s
540
+ {
541
+ RowVec_8u32s() { smallValues = false; }
542
+ RowVec_8u32s( const Mat& _kernel )
543
+ {
544
+ kernel = _kernel;
545
+ smallValues = true;
546
+ int k, ksize = kernel.rows + kernel.cols - 1;
547
+ for( k = 0; k < ksize; k++ )
548
+ {
549
+ int v = ((const int*)kernel.data)[k];
550
+ if( v < SHRT_MIN || v > SHRT_MAX )
551
+ {
552
+ smallValues = false;
553
+ break;
554
+ }
555
+ }
556
+ }
557
+
558
+ int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
559
+ {
560
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
561
+ return 0;
562
+
563
+ int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
564
+ int* dst = (int*)_dst;
565
+ const int* _kx = (const int*)kernel.data;
566
+ width *= cn;
567
+
568
+ if( smallValues )
569
+ {
570
+ for( ; i <= width - 16; i += 16 )
571
+ {
572
+ const uchar* src = _src + i;
573
+ __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
574
+ __m128i x0, x1, x2, x3;
575
+
576
+ for( k = 0; k < _ksize; k++, src += cn )
577
+ {
578
+ f = _mm_cvtsi32_si128(_kx[k]);
579
+ f = _mm_shuffle_epi32(f, 0);
580
+ f = _mm_packs_epi32(f, f);
581
+
582
+ x0 = _mm_loadu_si128((const __m128i*)src);
583
+ x2 = _mm_unpackhi_epi8(x0, z);
584
+ x0 = _mm_unpacklo_epi8(x0, z);
585
+ x1 = _mm_mulhi_epi16(x0, f);
586
+ x3 = _mm_mulhi_epi16(x2, f);
587
+ x0 = _mm_mullo_epi16(x0, f);
588
+ x2 = _mm_mullo_epi16(x2, f);
589
+
590
+ s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
591
+ s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
592
+ s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
593
+ s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
594
+ }
595
+
596
+ _mm_store_si128((__m128i*)(dst + i), s0);
597
+ _mm_store_si128((__m128i*)(dst + i + 4), s1);
598
+ _mm_store_si128((__m128i*)(dst + i + 8), s2);
599
+ _mm_store_si128((__m128i*)(dst + i + 12), s3);
600
+ }
601
+
602
+ for( ; i <= width - 4; i += 4 )
603
+ {
604
+ const uchar* src = _src + i;
605
+ __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;
606
+
607
+ for( k = 0; k < _ksize; k++, src += cn )
608
+ {
609
+ f = _mm_cvtsi32_si128(_kx[k]);
610
+ f = _mm_shuffle_epi32(f, 0);
611
+ f = _mm_packs_epi32(f, f);
612
+
613
+ x0 = _mm_cvtsi32_si128(*(const int*)src);
614
+ x0 = _mm_unpacklo_epi8(x0, z);
615
+ x1 = _mm_mulhi_epi16(x0, f);
616
+ x0 = _mm_mullo_epi16(x0, f);
617
+ s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
618
+ }
619
+ _mm_store_si128((__m128i*)(dst + i), s0);
620
+ }
621
+ }
622
+ return i;
623
+ }
624
+
625
+ Mat kernel;
626
+ bool smallValues;
627
+ };
628
+
629
+
630
+ struct SymmRowSmallVec_8u32s
631
+ {
632
+ SymmRowSmallVec_8u32s() { smallValues = false; }
633
+ SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
634
+ {
635
+ kernel = _kernel;
636
+ symmetryType = _symmetryType;
637
+ smallValues = true;
638
+ int k, ksize = kernel.rows + kernel.cols - 1;
639
+ for( k = 0; k < ksize; k++ )
640
+ {
641
+ int v = ((const int*)kernel.data)[k];
642
+ if( v < SHRT_MIN || v > SHRT_MAX )
643
+ {
644
+ smallValues = false;
645
+ break;
646
+ }
647
+ }
648
+ }
649
+
650
+ int operator()(const uchar* src, uchar* _dst, int width, int cn) const
651
+ {
652
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
653
+ return 0;
654
+
655
+ int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
656
+ int* dst = (int*)_dst;
657
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
658
+ const int* kx = (const int*)kernel.data + _ksize/2;
659
+ if( !smallValues )
660
+ return 0;
661
+
662
+ src += (_ksize/2)*cn;
663
+ width *= cn;
664
+
665
+ __m128i z = _mm_setzero_si128();
666
+ if( symmetrical )
667
+ {
668
+ if( _ksize == 1 )
669
+ return 0;
670
+ if( _ksize == 3 )
671
+ {
672
+ if( kx[0] == 2 && kx[1] == 1 )
673
+ for( ; i <= width - 16; i += 16, src += 16 )
674
+ {
675
+ __m128i x0, x1, x2, y0, y1, y2;
676
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
677
+ x1 = _mm_loadu_si128((__m128i*)src);
678
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
679
+ y0 = _mm_unpackhi_epi8(x0, z);
680
+ x0 = _mm_unpacklo_epi8(x0, z);
681
+ y1 = _mm_unpackhi_epi8(x1, z);
682
+ x1 = _mm_unpacklo_epi8(x1, z);
683
+ y2 = _mm_unpackhi_epi8(x2, z);
684
+ x2 = _mm_unpacklo_epi8(x2, z);
685
+ x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
686
+ y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
687
+ _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
688
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
689
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
690
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
691
+ }
692
+ else if( kx[0] == -2 && kx[1] == 1 )
693
+ for( ; i <= width - 16; i += 16, src += 16 )
694
+ {
695
+ __m128i x0, x1, x2, y0, y1, y2;
696
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
697
+ x1 = _mm_loadu_si128((__m128i*)src);
698
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
699
+ y0 = _mm_unpackhi_epi8(x0, z);
700
+ x0 = _mm_unpacklo_epi8(x0, z);
701
+ y1 = _mm_unpackhi_epi8(x1, z);
702
+ x1 = _mm_unpacklo_epi8(x1, z);
703
+ y2 = _mm_unpackhi_epi8(x2, z);
704
+ x2 = _mm_unpacklo_epi8(x2, z);
705
+ x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
706
+ y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
707
+ _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
708
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
709
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
710
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
711
+ }
712
+ else
713
+ {
714
+ __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
715
+ k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
716
+ k0 = _mm_packs_epi32(k0, k0);
717
+ k1 = _mm_packs_epi32(k1, k1);
718
+
719
+ for( ; i <= width - 16; i += 16, src += 16 )
720
+ {
721
+ __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
722
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
723
+ x1 = _mm_loadu_si128((__m128i*)src);
724
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
725
+ y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
726
+ x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
727
+ y1 = _mm_unpackhi_epi8(x1, z);
728
+ x1 = _mm_unpacklo_epi8(x1, z);
729
+
730
+ t1 = _mm_mulhi_epi16(x1, k0);
731
+ t0 = _mm_mullo_epi16(x1, k0);
732
+ x2 = _mm_mulhi_epi16(x0, k1);
733
+ x0 = _mm_mullo_epi16(x0, k1);
734
+ z0 = _mm_unpacklo_epi16(t0, t1);
735
+ z1 = _mm_unpackhi_epi16(t0, t1);
736
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
737
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));
738
+
739
+ t1 = _mm_mulhi_epi16(y1, k0);
740
+ t0 = _mm_mullo_epi16(y1, k0);
741
+ y1 = _mm_mulhi_epi16(y0, k1);
742
+ y0 = _mm_mullo_epi16(y0, k1);
743
+ z2 = _mm_unpacklo_epi16(t0, t1);
744
+ z3 = _mm_unpackhi_epi16(t0, t1);
745
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
746
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
747
+ _mm_store_si128((__m128i*)(dst + i), z0);
748
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
749
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
750
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
751
+ }
752
+ }
753
+ }
754
+ else if( _ksize == 5 )
755
+ {
756
+ if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
757
+ for( ; i <= width - 16; i += 16, src += 16 )
758
+ {
759
+ __m128i x0, x1, x2, y0, y1, y2;
760
+ x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
761
+ x1 = _mm_loadu_si128((__m128i*)src);
762
+ x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
763
+ y0 = _mm_unpackhi_epi8(x0, z);
764
+ x0 = _mm_unpacklo_epi8(x0, z);
765
+ y1 = _mm_unpackhi_epi8(x1, z);
766
+ x1 = _mm_unpacklo_epi8(x1, z);
767
+ y2 = _mm_unpackhi_epi8(x2, z);
768
+ x2 = _mm_unpacklo_epi8(x2, z);
769
+ x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
770
+ y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
771
+ _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
772
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
773
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
774
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
775
+ }
776
+ else
777
+ {
778
+ __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
779
+ k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
780
+ k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
781
+ k0 = _mm_packs_epi32(k0, k0);
782
+ k1 = _mm_packs_epi32(k1, k1);
783
+ k2 = _mm_packs_epi32(k2, k2);
784
+
785
+ for( ; i <= width - 16; i += 16, src += 16 )
786
+ {
787
+ __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
788
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
789
+ x1 = _mm_loadu_si128((__m128i*)src);
790
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
791
+ y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
792
+ x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
793
+ y1 = _mm_unpackhi_epi8(x1, z);
794
+ x1 = _mm_unpacklo_epi8(x1, z);
795
+
796
+ t1 = _mm_mulhi_epi16(x1, k0);
797
+ t0 = _mm_mullo_epi16(x1, k0);
798
+ x2 = _mm_mulhi_epi16(x0, k1);
799
+ x0 = _mm_mullo_epi16(x0, k1);
800
+ z0 = _mm_unpacklo_epi16(t0, t1);
801
+ z1 = _mm_unpackhi_epi16(t0, t1);
802
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
803
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));
804
+
805
+ t1 = _mm_mulhi_epi16(y1, k0);
806
+ t0 = _mm_mullo_epi16(y1, k0);
807
+ y1 = _mm_mulhi_epi16(y0, k1);
808
+ y0 = _mm_mullo_epi16(y0, k1);
809
+ z2 = _mm_unpacklo_epi16(t0, t1);
810
+ z3 = _mm_unpackhi_epi16(t0, t1);
811
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
812
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
813
+
814
+ x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
815
+ x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
816
+ y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
817
+ y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
818
+
819
+ t1 = _mm_mulhi_epi16(y0, k2);
820
+ t0 = _mm_mullo_epi16(y0, k2);
821
+ y0 = _mm_mullo_epi16(y1, k2);
822
+ y1 = _mm_mulhi_epi16(y1, k2);
823
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
824
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
825
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
826
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
827
+
828
+ _mm_store_si128((__m128i*)(dst + i), z0);
829
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
830
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
831
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
832
+ }
833
+ }
834
+ }
835
+ }
836
+ else
837
+ {
838
+ if( _ksize == 3 )
839
+ {
840
+ if( kx[0] == 0 && kx[1] == 1 )
841
+ for( ; i <= width - 16; i += 16, src += 16 )
842
+ {
843
+ __m128i x0, x1, y0;
844
+ x0 = _mm_loadu_si128((__m128i*)(src + cn));
845
+ x1 = _mm_loadu_si128((__m128i*)(src - cn));
846
+ y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
847
+ x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
848
+ _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
849
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
850
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
851
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
852
+ }
853
+ else
854
+ {
855
+ __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
856
+ k1 = _mm_packs_epi32(k1, k1);
857
+
858
+ for( ; i <= width - 16; i += 16, src += 16 )
859
+ {
860
+ __m128i x0, x1, y0, y1, z0, z1, z2, z3;
861
+ x0 = _mm_loadu_si128((__m128i*)(src + cn));
862
+ x1 = _mm_loadu_si128((__m128i*)(src - cn));
863
+ y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
864
+ x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
865
+
866
+ x1 = _mm_mulhi_epi16(x0, k1);
867
+ x0 = _mm_mullo_epi16(x0, k1);
868
+ z0 = _mm_unpacklo_epi16(x0, x1);
869
+ z1 = _mm_unpackhi_epi16(x0, x1);
870
+
871
+ y1 = _mm_mulhi_epi16(y0, k1);
872
+ y0 = _mm_mullo_epi16(y0, k1);
873
+ z2 = _mm_unpacklo_epi16(y0, y1);
874
+ z3 = _mm_unpackhi_epi16(y0, y1);
875
+ _mm_store_si128((__m128i*)(dst + i), z0);
876
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
877
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
878
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
879
+ }
880
+ }
881
+ }
882
+ else if( _ksize == 5 )
883
+ {
884
+ __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
885
+ k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
886
+ k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
887
+ k0 = _mm_packs_epi32(k0, k0);
888
+ k1 = _mm_packs_epi32(k1, k1);
889
+ k2 = _mm_packs_epi32(k2, k2);
890
+
891
+ for( ; i <= width - 16; i += 16, src += 16 )
892
+ {
893
+ __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
894
+ x0 = _mm_loadu_si128((__m128i*)(src + cn));
895
+ x2 = _mm_loadu_si128((__m128i*)(src - cn));
896
+ y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
897
+ x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
898
+
899
+ x2 = _mm_mulhi_epi16(x0, k1);
900
+ x0 = _mm_mullo_epi16(x0, k1);
901
+ z0 = _mm_unpacklo_epi16(x0, x2);
902
+ z1 = _mm_unpackhi_epi16(x0, x2);
903
+ y1 = _mm_mulhi_epi16(y0, k1);
904
+ y0 = _mm_mullo_epi16(y0, k1);
905
+ z2 = _mm_unpacklo_epi16(y0, y1);
906
+ z3 = _mm_unpackhi_epi16(y0, y1);
907
+
908
+ x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
909
+ x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
910
+ y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
911
+ y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
912
+
913
+ t1 = _mm_mulhi_epi16(y0, k2);
914
+ t0 = _mm_mullo_epi16(y0, k2);
915
+ y0 = _mm_mullo_epi16(y1, k2);
916
+ y1 = _mm_mulhi_epi16(y1, k2);
917
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
918
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
919
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
920
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
921
+
922
+ _mm_store_si128((__m128i*)(dst + i), z0);
923
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
924
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
925
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
926
+ }
927
+ }
928
+ }
929
+
930
+ src -= (_ksize/2)*cn;
931
+ kx -= _ksize/2;
932
+ for( ; i <= width - 4; i += 4, src += 4 )
933
+ {
934
+ __m128i f, s0 = z, x0, x1;
935
+
936
+ for( k = j = 0; k < _ksize; k++, j += cn )
937
+ {
938
+ f = _mm_cvtsi32_si128(kx[k]);
939
+ f = _mm_shuffle_epi32(f, 0);
940
+ f = _mm_packs_epi32(f, f);
941
+
942
+ x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
943
+ x0 = _mm_unpacklo_epi8(x0, z);
944
+ x1 = _mm_mulhi_epi16(x0, f);
945
+ x0 = _mm_mullo_epi16(x0, f);
946
+ s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
947
+ }
948
+ _mm_store_si128((__m128i*)(dst + i), s0);
949
+ }
950
+
951
+ return i;
952
+ }
953
+
954
+ Mat kernel;
955
+ int symmetryType;
956
+ bool smallValues;
957
+ };
958
+
959
+
960
+ struct SymmColumnVec_32s8u
961
+ {
962
+ SymmColumnVec_32s8u() { symmetryType=0; }
963
+ SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
964
+ {
965
+ symmetryType = _symmetryType;
966
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
967
+ delta = (float)(_delta/(1 << _bits));
968
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
969
+ }
970
+
971
+ int operator()(const uchar** _src, uchar* dst, int width) const
972
+ {
973
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
974
+ return 0;
975
+
976
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
977
+ const float* ky = (const float*)kernel.data + ksize2;
978
+ int i = 0, k;
979
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
980
+ const int** src = (const int**)_src;
981
+ const __m128i *S, *S2;
982
+ __m128 d4 = _mm_set1_ps(delta);
983
+
984
+ if( symmetrical )
985
+ {
986
+ for( ; i <= width - 16; i += 16 )
987
+ {
988
+ __m128 f = _mm_load_ss(ky);
989
+ f = _mm_shuffle_ps(f, f, 0);
990
+ __m128 s0, s1, s2, s3;
991
+ __m128i x0, x1;
992
+ S = (const __m128i*)(src[0] + i);
993
+ s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
994
+ s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
995
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
996
+ s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
997
+ s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
998
+ s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
999
+ s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
1000
+ s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
1001
+
1002
+ for( k = 1; k <= ksize2; k++ )
1003
+ {
1004
+ S = (const __m128i*)(src[k] + i);
1005
+ S2 = (const __m128i*)(src[-k] + i);
1006
+ f = _mm_load_ss(ky+k);
1007
+ f = _mm_shuffle_ps(f, f, 0);
1008
+ x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1009
+ x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
1010
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1011
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1012
+ x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
1013
+ x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
1014
+ s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1015
+ s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1016
+ }
1017
+
1018
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1019
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1020
+ x0 = _mm_packus_epi16(x0, x1);
1021
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1022
+ }
1023
+
1024
+ for( ; i <= width - 4; i += 4 )
1025
+ {
1026
+ __m128 f = _mm_load_ss(ky);
1027
+ f = _mm_shuffle_ps(f, f, 0);
1028
+ __m128i x0;
1029
+ __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
1030
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1031
+
1032
+ for( k = 1; k <= ksize2; k++ )
1033
+ {
1034
+ S = (const __m128i*)(src[k] + i);
1035
+ S2 = (const __m128i*)(src[-k] + i);
1036
+ f = _mm_load_ss(ky+k);
1037
+ f = _mm_shuffle_ps(f, f, 0);
1038
+ x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1039
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1040
+ }
1041
+
1042
+ x0 = _mm_cvtps_epi32(s0);
1043
+ x0 = _mm_packs_epi32(x0, x0);
1044
+ x0 = _mm_packus_epi16(x0, x0);
1045
+ *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
1046
+ }
1047
+ }
1048
+ else
1049
+ {
1050
+ for( ; i <= width - 16; i += 16 )
1051
+ {
1052
+ __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1053
+ __m128i x0, x1;
1054
+
1055
+ for( k = 1; k <= ksize2; k++ )
1056
+ {
1057
+ S = (const __m128i*)(src[k] + i);
1058
+ S2 = (const __m128i*)(src[-k] + i);
1059
+ f = _mm_load_ss(ky+k);
1060
+ f = _mm_shuffle_ps(f, f, 0);
1061
+ x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1062
+ x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
1063
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1064
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1065
+ x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
1066
+ x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
1067
+ s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1068
+ s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1069
+ }
1070
+
1071
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1072
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1073
+ x0 = _mm_packus_epi16(x0, x1);
1074
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1075
+ }
1076
+
1077
+ for( ; i <= width - 4; i += 4 )
1078
+ {
1079
+ __m128 f, s0 = d4;
1080
+ __m128i x0;
1081
+
1082
+ for( k = 1; k <= ksize2; k++ )
1083
+ {
1084
+ S = (const __m128i*)(src[k] + i);
1085
+ S2 = (const __m128i*)(src[-k] + i);
1086
+ f = _mm_load_ss(ky+k);
1087
+ f = _mm_shuffle_ps(f, f, 0);
1088
+ x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1089
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1090
+ }
1091
+
1092
+ x0 = _mm_cvtps_epi32(s0);
1093
+ x0 = _mm_packs_epi32(x0, x0);
1094
+ x0 = _mm_packus_epi16(x0, x0);
1095
+ *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
1096
+ }
1097
+ }
1098
+
1099
+ return i;
1100
+ }
1101
+
1102
+ int symmetryType;
1103
+ float delta;
1104
+ Mat kernel;
1105
+ };
1106
+
1107
+
1108
+ struct SymmColumnSmallVec_32s16s
1109
+ {
1110
+ SymmColumnSmallVec_32s16s() { symmetryType=0; }
1111
+ SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
1112
+ {
1113
+ symmetryType = _symmetryType;
1114
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1115
+ delta = (float)(_delta/(1 << _bits));
1116
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1117
+ }
1118
+
1119
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1120
+ {
1121
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
1122
+ return 0;
1123
+
1124
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1125
+ const float* ky = (const float*)kernel.data + ksize2;
1126
+ int i = 0;
1127
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1128
+ const int** src = (const int**)_src;
1129
+ const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
1130
+ short* dst = (short*)_dst;
1131
+ __m128 df4 = _mm_set1_ps(delta);
1132
+ __m128i d4 = _mm_cvtps_epi32(df4);
1133
+
1134
+ if( symmetrical )
1135
+ {
1136
+ if( ky[0] == 2 && ky[1] == 1 )
1137
+ {
1138
+ for( ; i <= width - 8; i += 8 )
1139
+ {
1140
+ __m128i s0, s1, s2, s3, s4, s5;
1141
+ s0 = _mm_load_si128((__m128i*)(S0 + i));
1142
+ s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
1143
+ s2 = _mm_load_si128((__m128i*)(S1 + i));
1144
+ s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
1145
+ s4 = _mm_load_si128((__m128i*)(S2 + i));
1146
+ s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
1147
+ s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
1148
+ s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
1149
+ s0 = _mm_add_epi32(s0, d4);
1150
+ s1 = _mm_add_epi32(s1, d4);
1151
+ _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
1152
+ }
1153
+ }
1154
+ else if( ky[0] == -2 && ky[1] == 1 )
1155
+ {
1156
+ for( ; i <= width - 8; i += 8 )
1157
+ {
1158
+ __m128i s0, s1, s2, s3, s4, s5;
1159
+ s0 = _mm_load_si128((__m128i*)(S0 + i));
1160
+ s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
1161
+ s2 = _mm_load_si128((__m128i*)(S1 + i));
1162
+ s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
1163
+ s4 = _mm_load_si128((__m128i*)(S2 + i));
1164
+ s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
1165
+ s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
1166
+ s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
1167
+ s0 = _mm_add_epi32(s0, d4);
1168
+ s1 = _mm_add_epi32(s1, d4);
1169
+ _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
1170
+ }
1171
+ }
1172
+ else
1173
+ {
1174
+ __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
1175
+ for( ; i <= width - 8; i += 8 )
1176
+ {
1177
+ __m128 s0, s1;
1178
+ s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
1179
+ s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
1180
+ s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
1181
+ s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
1182
+ __m128i x0, x1;
1183
+ x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
1184
+ _mm_load_si128((__m128i*)(S2 + i)));
1185
+ x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
1186
+ _mm_load_si128((__m128i*)(S2 + i + 4)));
1187
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
1188
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
1189
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1190
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1191
+ }
1192
+ }
1193
+ }
1194
+ else
1195
+ {
1196
+ if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
1197
+ {
1198
+ if( ky[1] < 0 )
1199
+ std::swap(S0, S2);
1200
+ for( ; i <= width - 8; i += 8 )
1201
+ {
1202
+ __m128i s0, s1, s2, s3;
1203
+ s0 = _mm_load_si128((__m128i*)(S2 + i));
1204
+ s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
1205
+ s2 = _mm_load_si128((__m128i*)(S0 + i));
1206
+ s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
1207
+ s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
1208
+ s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
1209
+ _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
1210
+ }
1211
+ }
1212
+ else
1213
+ {
1214
+ __m128 k1 = _mm_set1_ps(ky[1]);
1215
+ for( ; i <= width - 8; i += 8 )
1216
+ {
1217
+ __m128 s0 = df4, s1 = df4;
1218
+ __m128i x0, x1;
1219
+ x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
1220
+ _mm_load_si128((__m128i*)(S2 + i)));
1221
+ x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
1222
+ _mm_load_si128((__m128i*)(S2 + i + 4)));
1223
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
1224
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
1225
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1226
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1227
+ }
1228
+ }
1229
+ }
1230
+
1231
+ return i;
1232
+ }
1233
+
1234
+ int symmetryType;
1235
+ float delta;
1236
+ Mat kernel;
1237
+ };
1238
+
1239
+
1240
+ /////////////////////////////////////// 32f //////////////////////////////////
1241
+
1242
+ struct RowVec_32f
1243
+ {
1244
+ RowVec_32f() {}
1245
+ RowVec_32f( const Mat& _kernel )
1246
+ {
1247
+ kernel = _kernel;
1248
+ }
1249
+
1250
+ int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1251
+ {
1252
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1253
+ return 0;
1254
+
1255
+ int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
1256
+ float* dst = (float*)_dst;
1257
+ const float* _kx = (const float*)kernel.data;
1258
+ width *= cn;
1259
+
1260
+ for( ; i <= width - 8; i += 8 )
1261
+ {
1262
+ const float* src = (const float*)_src + i;
1263
+ __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
1264
+ for( k = 0; k < _ksize; k++, src += cn )
1265
+ {
1266
+ f = _mm_load_ss(_kx+k);
1267
+ f = _mm_shuffle_ps(f, f, 0);
1268
+
1269
+ x0 = _mm_loadu_ps(src);
1270
+ x1 = _mm_loadu_ps(src + 4);
1271
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1272
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1273
+ }
1274
+ _mm_store_ps(dst + i, s0);
1275
+ _mm_store_ps(dst + i + 4, s1);
1276
+ }
1277
+ return i;
1278
+ }
1279
+
1280
+ Mat kernel;
1281
+ };
1282
+
1283
+
1284
+ struct SymmRowSmallVec_32f
1285
+ {
1286
+ SymmRowSmallVec_32f() {}
1287
+ SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
1288
+ {
1289
+ kernel = _kernel;
1290
+ symmetryType = _symmetryType;
1291
+ }
1292
+
1293
+ int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1294
+ {
1295
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1296
+ return 0;
1297
+
1298
+ int i = 0, _ksize = kernel.rows + kernel.cols - 1;
1299
+ float* dst = (float*)_dst;
1300
+ const float* src = (const float*)_src + (_ksize/2)*cn;
1301
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1302
+ const float* kx = (const float*)kernel.data + _ksize/2;
1303
+ width *= cn;
1304
+
1305
+ if( symmetrical )
1306
+ {
1307
+ if( _ksize == 1 )
1308
+ return 0;
1309
+ if( _ksize == 3 )
1310
+ {
1311
+ if( kx[0] == 2 && kx[1] == 1 )
1312
+ for( ; i <= width - 8; i += 8, src += 8 )
1313
+ {
1314
+ __m128 x0, x1, x2, y0, y1, y2;
1315
+ x0 = _mm_loadu_ps(src - cn);
1316
+ x1 = _mm_loadu_ps(src);
1317
+ x2 = _mm_loadu_ps(src + cn);
1318
+ y0 = _mm_loadu_ps(src - cn + 4);
1319
+ y1 = _mm_loadu_ps(src + 4);
1320
+ y2 = _mm_loadu_ps(src + cn + 4);
1321
+ x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
1322
+ y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
1323
+ _mm_store_ps(dst + i, x0);
1324
+ _mm_store_ps(dst + i + 4, y0);
1325
+ }
1326
+ else if( kx[0] == -2 && kx[1] == 1 )
1327
+ for( ; i <= width - 8; i += 8, src += 8 )
1328
+ {
1329
+ __m128 x0, x1, x2, y0, y1, y2;
1330
+ x0 = _mm_loadu_ps(src - cn);
1331
+ x1 = _mm_loadu_ps(src);
1332
+ x2 = _mm_loadu_ps(src + cn);
1333
+ y0 = _mm_loadu_ps(src - cn + 4);
1334
+ y1 = _mm_loadu_ps(src + 4);
1335
+ y2 = _mm_loadu_ps(src + cn + 4);
1336
+ x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
1337
+ y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
1338
+ _mm_store_ps(dst + i, x0);
1339
+ _mm_store_ps(dst + i + 4, y0);
1340
+ }
1341
+ else
1342
+ {
1343
+ __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
1344
+ for( ; i <= width - 8; i += 8, src += 8 )
1345
+ {
1346
+ __m128 x0, x1, x2, y0, y1, y2;
1347
+ x0 = _mm_loadu_ps(src - cn);
1348
+ x1 = _mm_loadu_ps(src);
1349
+ x2 = _mm_loadu_ps(src + cn);
1350
+ y0 = _mm_loadu_ps(src - cn + 4);
1351
+ y1 = _mm_loadu_ps(src + 4);
1352
+ y2 = _mm_loadu_ps(src + cn + 4);
1353
+
1354
+ x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
1355
+ y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
1356
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
1357
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
1358
+ _mm_store_ps(dst + i, x0);
1359
+ _mm_store_ps(dst + i + 4, y0);
1360
+ }
1361
+ }
1362
+ }
1363
+ else if( _ksize == 5 )
1364
+ {
1365
+ if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
1366
+ for( ; i <= width - 8; i += 8, src += 8 )
1367
+ {
1368
+ __m128 x0, x1, x2, y0, y1, y2;
1369
+ x0 = _mm_loadu_ps(src - cn*2);
1370
+ x1 = _mm_loadu_ps(src);
1371
+ x2 = _mm_loadu_ps(src + cn*2);
1372
+ y0 = _mm_loadu_ps(src - cn*2 + 4);
1373
+ y1 = _mm_loadu_ps(src + 4);
1374
+ y2 = _mm_loadu_ps(src + cn*2 + 4);
1375
+ x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
1376
+ y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
1377
+ _mm_store_ps(dst + i, x0);
1378
+ _mm_store_ps(dst + i + 4, y0);
1379
+ }
1380
+ else
1381
+ {
1382
+ __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
1383
+ for( ; i <= width - 8; i += 8, src += 8 )
1384
+ {
1385
+ __m128 x0, x1, x2, y0, y1, y2;
1386
+ x0 = _mm_loadu_ps(src - cn);
1387
+ x1 = _mm_loadu_ps(src);
1388
+ x2 = _mm_loadu_ps(src + cn);
1389
+ y0 = _mm_loadu_ps(src - cn + 4);
1390
+ y1 = _mm_loadu_ps(src + 4);
1391
+ y2 = _mm_loadu_ps(src + cn + 4);
1392
+
1393
+ x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
1394
+ y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
1395
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
1396
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
1397
+
1398
+ x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
1399
+ y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
1400
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
1401
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
1402
+
1403
+ _mm_store_ps(dst + i, x0);
1404
+ _mm_store_ps(dst + i + 4, y0);
1405
+ }
1406
+ }
1407
+ }
1408
+ }
1409
+ else
1410
+ {
1411
+ if( _ksize == 3 )
1412
+ {
1413
+ if( kx[0] == 0 && kx[1] == 1 )
1414
+ for( ; i <= width - 8; i += 8, src += 8 )
1415
+ {
1416
+ __m128 x0, x2, y0, y2;
1417
+ x0 = _mm_loadu_ps(src + cn);
1418
+ x2 = _mm_loadu_ps(src - cn);
1419
+ y0 = _mm_loadu_ps(src + cn + 4);
1420
+ y2 = _mm_loadu_ps(src - cn + 4);
1421
+ x0 = _mm_sub_ps(x0, x2);
1422
+ y0 = _mm_sub_ps(y0, y2);
1423
+ _mm_store_ps(dst + i, x0);
1424
+ _mm_store_ps(dst + i + 4, y0);
1425
+ }
1426
+ else
1427
+ {
1428
+ __m128 k1 = _mm_set1_ps(kx[1]);
1429
+ for( ; i <= width - 8; i += 8, src += 8 )
1430
+ {
1431
+ __m128 x0, x2, y0, y2;
1432
+ x0 = _mm_loadu_ps(src + cn);
1433
+ x2 = _mm_loadu_ps(src - cn);
1434
+ y0 = _mm_loadu_ps(src + cn + 4);
1435
+ y2 = _mm_loadu_ps(src - cn + 4);
1436
+
1437
+ x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
1438
+ y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
1439
+ _mm_store_ps(dst + i, x0);
1440
+ _mm_store_ps(dst + i + 4, y0);
1441
+ }
1442
+ }
1443
+ }
1444
+ else if( _ksize == 5 )
1445
+ {
1446
+ __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
1447
+ for( ; i <= width - 8; i += 8, src += 8 )
1448
+ {
1449
+ __m128 x0, x2, y0, y2;
1450
+ x0 = _mm_loadu_ps(src + cn);
1451
+ x2 = _mm_loadu_ps(src - cn);
1452
+ y0 = _mm_loadu_ps(src + cn + 4);
1453
+ y2 = _mm_loadu_ps(src - cn + 4);
1454
+
1455
+ x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
1456
+ y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
1457
+
1458
+ x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
1459
+ y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
1460
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
1461
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
1462
+
1463
+ _mm_store_ps(dst + i, x0);
1464
+ _mm_store_ps(dst + i + 4, y0);
1465
+ }
1466
+ }
1467
+ }
1468
+
1469
+ return i;
1470
+ }
1471
+
1472
+ Mat kernel;
1473
+ int symmetryType;
1474
+ };
1475
+
1476
+
1477
+ struct SymmColumnVec_32f
1478
+ {
1479
+ SymmColumnVec_32f() { symmetryType=0; }
1480
+ SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
1481
+ {
1482
+ symmetryType = _symmetryType;
1483
+ kernel = _kernel;
1484
+ delta = (float)_delta;
1485
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1486
+ }
1487
+
1488
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1489
+ {
1490
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1491
+ return 0;
1492
+
1493
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1494
+ const float* ky = (const float*)kernel.data + ksize2;
1495
+ int i = 0, k;
1496
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1497
+ const float** src = (const float**)_src;
1498
+ const float *S, *S2;
1499
+ float* dst = (float*)_dst;
1500
+ __m128 d4 = _mm_set1_ps(delta);
1501
+
1502
+ if( symmetrical )
1503
+ {
1504
+ for( ; i <= width - 16; i += 16 )
1505
+ {
1506
+ __m128 f = _mm_load_ss(ky);
1507
+ f = _mm_shuffle_ps(f, f, 0);
1508
+ __m128 s0, s1, s2, s3;
1509
+ __m128 x0, x1;
1510
+ S = src[0] + i;
1511
+ s0 = _mm_load_ps(S);
1512
+ s1 = _mm_load_ps(S+4);
1513
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1514
+ s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
1515
+ s2 = _mm_load_ps(S+8);
1516
+ s3 = _mm_load_ps(S+12);
1517
+ s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
1518
+ s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
1519
+
1520
+ for( k = 1; k <= ksize2; k++ )
1521
+ {
1522
+ S = src[k] + i;
1523
+ S2 = src[-k] + i;
1524
+ f = _mm_load_ss(ky+k);
1525
+ f = _mm_shuffle_ps(f, f, 0);
1526
+ x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
1527
+ x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1528
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1529
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1530
+ x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1531
+ x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1532
+ s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1533
+ s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1534
+ }
1535
+
1536
+ _mm_storeu_ps(dst + i, s0);
1537
+ _mm_storeu_ps(dst + i + 4, s1);
1538
+ _mm_storeu_ps(dst + i + 8, s2);
1539
+ _mm_storeu_ps(dst + i + 12, s3);
1540
+ }
1541
+
1542
+ for( ; i <= width - 4; i += 4 )
1543
+ {
1544
+ __m128 f = _mm_load_ss(ky);
1545
+ f = _mm_shuffle_ps(f, f, 0);
1546
+ __m128 x0, s0 = _mm_load_ps(src[0] + i);
1547
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1548
+
1549
+ for( k = 1; k <= ksize2; k++ )
1550
+ {
1551
+ f = _mm_load_ss(ky+k);
1552
+ f = _mm_shuffle_ps(f, f, 0);
1553
+ S = src[k] + i;
1554
+ S2 = src[-k] + i;
1555
+ x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1556
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1557
+ }
1558
+
1559
+ _mm_storeu_ps(dst + i, s0);
1560
+ }
1561
+ }
1562
+ else
1563
+ {
1564
+ for( ; i <= width - 16; i += 16 )
1565
+ {
1566
+ __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1567
+ __m128 x0, x1;
1568
+ S = src[0] + i;
1569
+
1570
+ for( k = 1; k <= ksize2; k++ )
1571
+ {
1572
+ S = src[k] + i;
1573
+ S2 = src[-k] + i;
1574
+ f = _mm_load_ss(ky+k);
1575
+ f = _mm_shuffle_ps(f, f, 0);
1576
+ x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
1577
+ x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1578
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1579
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1580
+ x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1581
+ x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1582
+ s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1583
+ s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1584
+ }
1585
+
1586
+ _mm_storeu_ps(dst + i, s0);
1587
+ _mm_storeu_ps(dst + i + 4, s1);
1588
+ _mm_storeu_ps(dst + i + 8, s2);
1589
+ _mm_storeu_ps(dst + i + 12, s3);
1590
+ }
1591
+
1592
+ for( ; i <= width - 4; i += 4 )
1593
+ {
1594
+ __m128 f, x0, s0 = d4;
1595
+
1596
+ for( k = 1; k <= ksize2; k++ )
1597
+ {
1598
+ f = _mm_load_ss(ky+k);
1599
+ f = _mm_shuffle_ps(f, f, 0);
1600
+ x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1601
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1602
+ }
1603
+
1604
+ _mm_storeu_ps(dst + i, s0);
1605
+ }
1606
+ }
1607
+
1608
+ return i;
1609
+ }
1610
+
1611
+ int symmetryType;
1612
+ float delta;
1613
+ Mat kernel;
1614
+ };
1615
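For reference, the scalar arithmetic that SymmColumnVec_32f vectorizes is the symmetric (or antisymmetric) folding of the vertical taps around the centre row. A minimal plain-C++ sketch of the same computation (illustrative only; ky points at the centre tap and src[-k]..src[k] are the buffered rows, exactly as in the struct above):

    // Scalar equivalent of SymmColumnVec_32f (sketch, not the code the library runs).
    static void symmColumnScalar(const float** src, float* dst, int width,
                                 const float* ky, int ksize2, float delta, bool symmetrical)
    {
        for( int i = 0; i < width; i++ )
        {
            float s = symmetrical ? ky[0]*src[0][i] + delta : delta;
            for( int k = 1; k <= ksize2; k++ )
                s += symmetrical ? ky[k]*(src[k][i] + src[-k][i])
                                 : ky[k]*(src[k][i] - src[-k][i]);
            dst[i] = s;
        }
    }

The SSE version above performs the same computation four (or sixteen) outputs at a time, broadcasting each ky[k] with _mm_load_ss followed by _mm_shuffle_ps.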
+
1616
+
1617
+ struct SymmColumnSmallVec_32f
1618
+ {
1619
+ SymmColumnSmallVec_32f() { symmetryType=0; }
1620
+ SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
1621
+ {
1622
+ symmetryType = _symmetryType;
1623
+ kernel = _kernel;
1624
+ delta = (float)_delta;
1625
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1626
+ }
1627
+
1628
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1629
+ {
1630
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1631
+ return 0;
1632
+
1633
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1634
+ const float* ky = (const float*)kernel.data + ksize2;
1635
+ int i = 0;
1636
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1637
+ const float** src = (const float**)_src;
1638
+ const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
1639
+ float* dst = (float*)_dst;
1640
+ __m128 d4 = _mm_set1_ps(delta);
1641
+
1642
+ if( symmetrical )
1643
+ {
1644
+ if( ky[0] == 2 && ky[1] == 1 )
1645
+ {
1646
+ for( ; i <= width - 8; i += 8 )
1647
+ {
1648
+ __m128 s0, s1, s2, s3, s4, s5;
1649
+ s0 = _mm_load_ps(S0 + i);
1650
+ s1 = _mm_load_ps(S0 + i + 4);
1651
+ s2 = _mm_load_ps(S1 + i);
1652
+ s3 = _mm_load_ps(S1 + i + 4);
1653
+ s4 = _mm_load_ps(S2 + i);
1654
+ s5 = _mm_load_ps(S2 + i + 4);
1655
+ s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
1656
+ s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
1657
+ s0 = _mm_add_ps(s0, d4);
1658
+ s1 = _mm_add_ps(s1, d4);
1659
+ _mm_storeu_ps(dst + i, s0);
1660
+ _mm_storeu_ps(dst + i + 4, s1);
1661
+ }
1662
+ }
1663
+ else if( ky[0] == -2 && ky[1] == 1 )
1664
+ {
1665
+ for( ; i <= width - 8; i += 8 )
1666
+ {
1667
+ __m128 s0, s1, s2, s3, s4, s5;
1668
+ s0 = _mm_load_ps(S0 + i);
1669
+ s1 = _mm_load_ps(S0 + i + 4);
1670
+ s2 = _mm_load_ps(S1 + i);
1671
+ s3 = _mm_load_ps(S1 + i + 4);
1672
+ s4 = _mm_load_ps(S2 + i);
1673
+ s5 = _mm_load_ps(S2 + i + 4);
1674
+ s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
1675
+ s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
1676
+ s0 = _mm_add_ps(s0, d4);
1677
+ s1 = _mm_add_ps(s1, d4);
1678
+ _mm_storeu_ps(dst + i, s0);
1679
+ _mm_storeu_ps(dst + i + 4, s1);
1680
+ }
1681
+ }
1682
+ else
1683
+ {
1684
+ __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
1685
+ for( ; i <= width - 8; i += 8 )
1686
+ {
1687
+ __m128 s0, s1, x0, x1;
1688
+ s0 = _mm_load_ps(S1 + i);
1689
+ s1 = _mm_load_ps(S1 + i + 4);
1690
+ s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
1691
+ s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
1692
+ x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
1693
+ x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
1694
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
1695
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
1696
+ _mm_storeu_ps(dst + i, s0);
1697
+ _mm_storeu_ps(dst + i + 4, s1);
1698
+ }
1699
+ }
1700
+ }
1701
+ else
1702
+ {
1703
+ if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
1704
+ {
1705
+ if( ky[1] < 0 )
1706
+ std::swap(S0, S2);
1707
+ for( ; i <= width - 8; i += 8 )
1708
+ {
1709
+ __m128 s0, s1, s2, s3;
1710
+ s0 = _mm_load_ps(S2 + i);
1711
+ s1 = _mm_load_ps(S2 + i + 4);
1712
+ s2 = _mm_load_ps(S0 + i);
1713
+ s3 = _mm_load_ps(S0 + i + 4);
1714
+ s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
1715
+ s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
1716
+ _mm_storeu_ps(dst + i, s0);
1717
+ _mm_storeu_ps(dst + i + 4, s1);
1718
+ }
1719
+ }
1720
+ else
1721
+ {
1722
+ __m128 k1 = _mm_set1_ps(ky[1]);
1723
+ for( ; i <= width - 8; i += 8 )
1724
+ {
1725
+ __m128 s0 = d4, s1 = d4, x0, x1;
1726
+ x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
1727
+ x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
1728
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
1729
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
1730
+ _mm_storeu_ps(dst + i, s0);
1731
+ _mm_storeu_ps(dst + i + 4, s1);
1732
+ }
1733
+ }
1734
+ }
1735
+
1736
+ return i;
1737
+ }
1738
+
1739
+ int symmetryType;
1740
+ float delta;
1741
+ Mat kernel;
1742
+ };
1743
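SymmColumnSmallVec_32f is the 3-tap specialization: for the common smoothing column {1, 2, 1} and the Laplacian column {1, -2, 1} it avoids multiplications entirely (note the _mm_add_ps(s2, s2) doubling trick above). A one-line scalar sketch of the {1, 2, 1} fast path, assuming S0/S1/S2 are the rows above/at/below the output row:

    // {1,2,1} column tap without multiplies (illustrative sketch).
    inline float column_1_2_1(const float* S0, const float* S1, const float* S2, int i, float delta)
    {
        return S0[i] + (S1[i] + S1[i]) + S2[i] + delta;
    }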
+
1744
+
1745
+ /////////////////////////////// non-separable filters ///////////////////////////////
1746
+
1747
+ ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
1748
+
1749
+ struct FilterVec_8u
1750
+ {
1751
+ FilterVec_8u() {}
1752
+ FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
1753
+ {
1754
+ Mat kernel;
1755
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1756
+ delta = (float)(_delta/(1 << _bits));
1757
+ vector<Point> coords;
1758
+ preprocess2DKernel(kernel, coords, coeffs);
1759
+ _nz = (int)coords.size();
1760
+ }
1761
+
1762
+ int operator()(const uchar** src, uchar* dst, int width) const
1763
+ {
1764
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
1765
+ return 0;
1766
+
1767
+ const float* kf = (const float*)&coeffs[0];
1768
+ int i = 0, k, nz = _nz;
1769
+ __m128 d4 = _mm_set1_ps(delta);
1770
+
1771
+ for( ; i <= width - 16; i += 16 )
1772
+ {
1773
+ __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1774
+ __m128i x0, x1, z = _mm_setzero_si128();
1775
+
1776
+ for( k = 0; k < nz; k++ )
1777
+ {
1778
+ __m128 f = _mm_load_ss(kf+k), t0, t1;
1779
+ f = _mm_shuffle_ps(f, f, 0);
1780
+
1781
+ x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
1782
+ x1 = _mm_unpackhi_epi8(x0, z);
1783
+ x0 = _mm_unpacklo_epi8(x0, z);
1784
+
1785
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1786
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
1787
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1788
+ s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1789
+
1790
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
1791
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
1792
+ s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1793
+ s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1794
+ }
1795
+
1796
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1797
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1798
+ x0 = _mm_packus_epi16(x0, x1);
1799
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1800
+ }
1801
+
1802
+ for( ; i <= width - 4; i += 4 )
1803
+ {
1804
+ __m128 s0 = d4;
1805
+ __m128i x0, z = _mm_setzero_si128();
1806
+
1807
+ for( k = 0; k < nz; k++ )
1808
+ {
1809
+ __m128 f = _mm_load_ss(kf+k), t0;
1810
+ f = _mm_shuffle_ps(f, f, 0);
1811
+
1812
+ x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
1813
+ x0 = _mm_unpacklo_epi8(x0, z);
1814
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1815
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1816
+ }
1817
+
1818
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
1819
+ x0 = _mm_packus_epi16(x0, x0);
1820
+ *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
1821
+ }
1822
+
1823
+ return i;
1824
+ }
1825
+
1826
+ int _nz;
1827
+ vector<uchar> coeffs;
1828
+ float delta;
1829
+ };
1830
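FilterVec_8u handles the non-separable 8-bit case by widening each tap to float (unpack 8u -> 16 -> 32 bits, then cvtepi32_ps), accumulating with the float coefficients, and packing back down with saturation. Per output pixel it is roughly the following scalar computation (sketch only; the SSE path rounds via _mm_cvtps_epi32 rather than the explicit rounding shown here):

    // Scalar equivalent of one FilterVec_8u output pixel (illustrative sketch).
    // src[k] points at the source pixel under the k-th non-zero kernel tap.
    static unsigned char filter8uScalar(const unsigned char** src, const float* kf,
                                        int nz, int i, float delta)
    {
        float s = delta;
        for( int k = 0; k < nz; k++ )
            s += kf[k] * src[k][i];
        int v = (int)(s + (s >= 0 ? 0.5f : -0.5f));              // round to nearest
        return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);   // saturate to [0,255]
    }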
+
1831
+
1832
+ struct FilterVec_8u16s
1833
+ {
1834
+ FilterVec_8u16s() {}
1835
+ FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
1836
+ {
1837
+ Mat kernel;
1838
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1839
+ delta = (float)(_delta/(1 << _bits));
1840
+ vector<Point> coords;
1841
+ preprocess2DKernel(kernel, coords, coeffs);
1842
+ _nz = (int)coords.size();
1843
+ }
1844
+
1845
+ int operator()(const uchar** src, uchar* _dst, int width) const
1846
+ {
1847
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
1848
+ return 0;
1849
+
1850
+ const float* kf = (const float*)&coeffs[0];
1851
+ short* dst = (short*)_dst;
1852
+ int i = 0, k, nz = _nz;
1853
+ __m128 d4 = _mm_set1_ps(delta);
1854
+
1855
+ for( ; i <= width - 16; i += 16 )
1856
+ {
1857
+ __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1858
+ __m128i x0, x1, z = _mm_setzero_si128();
1859
+
1860
+ for( k = 0; k < nz; k++ )
1861
+ {
1862
+ __m128 f = _mm_load_ss(kf+k), t0, t1;
1863
+ f = _mm_shuffle_ps(f, f, 0);
1864
+
1865
+ x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
1866
+ x1 = _mm_unpackhi_epi8(x0, z);
1867
+ x0 = _mm_unpacklo_epi8(x0, z);
1868
+
1869
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1870
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
1871
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1872
+ s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1873
+
1874
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
1875
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
1876
+ s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1877
+ s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1878
+ }
1879
+
1880
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1881
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1882
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1883
+ _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
1884
+ }
1885
+
1886
+ for( ; i <= width - 4; i += 4 )
1887
+ {
1888
+ __m128 s0 = d4;
1889
+ __m128i x0, z = _mm_setzero_si128();
1890
+
1891
+ for( k = 0; k < nz; k++ )
1892
+ {
1893
+ __m128 f = _mm_load_ss(kf+k), t0;
1894
+ f = _mm_shuffle_ps(f, f, 0);
1895
+
1896
+ x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
1897
+ x0 = _mm_unpacklo_epi8(x0, z);
1898
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1899
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1900
+ }
1901
+
1902
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
1903
+ _mm_storel_epi64((__m128i*)(dst + i), x0);
1904
+ }
1905
+
1906
+ return i;
1907
+ }
1908
+
1909
+ int _nz;
1910
+ vector<uchar> coeffs;
1911
+ float delta;
1912
+ };
1913
+
1914
+
1915
+ struct FilterVec_32f
1916
+ {
1917
+ FilterVec_32f() {}
1918
+ FilterVec_32f(const Mat& _kernel, int, double _delta)
1919
+ {
1920
+ delta = (float)_delta;
1921
+ vector<Point> coords;
1922
+ preprocess2DKernel(_kernel, coords, coeffs);
1923
+ _nz = (int)coords.size();
1924
+ }
1925
+
1926
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1927
+ {
1928
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1929
+ return 0;
1930
+
1931
+ const float* kf = (const float*)&coeffs[0];
1932
+ const float** src = (const float**)_src;
1933
+ float* dst = (float*)_dst;
1934
+ int i = 0, k, nz = _nz;
1935
+ __m128 d4 = _mm_set1_ps(delta);
1936
+
1937
+ for( ; i <= width - 16; i += 16 )
1938
+ {
1939
+ __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1940
+
1941
+ for( k = 0; k < nz; k++ )
1942
+ {
1943
+ __m128 f = _mm_load_ss(kf+k), t0, t1;
1944
+ f = _mm_shuffle_ps(f, f, 0);
1945
+ const float* S = src[k] + i;
1946
+
1947
+ t0 = _mm_loadu_ps(S);
1948
+ t1 = _mm_loadu_ps(S + 4);
1949
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1950
+ s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1951
+
1952
+ t0 = _mm_loadu_ps(S + 8);
1953
+ t1 = _mm_loadu_ps(S + 12);
1954
+ s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1955
+ s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1956
+ }
1957
+
1958
+ _mm_storeu_ps(dst + i, s0);
1959
+ _mm_storeu_ps(dst + i + 4, s1);
1960
+ _mm_storeu_ps(dst + i + 8, s2);
1961
+ _mm_storeu_ps(dst + i + 12, s3);
1962
+ }
1963
+
1964
+ for( ; i <= width - 4; i += 4 )
1965
+ {
1966
+ __m128 s0 = d4;
1967
+
1968
+ for( k = 0; k < nz; k++ )
1969
+ {
1970
+ __m128 f = _mm_load_ss(kf+k), t0;
1971
+ f = _mm_shuffle_ps(f, f, 0);
1972
+ t0 = _mm_loadu_ps(src[k] + i);
1973
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1974
+ }
1975
+ _mm_storeu_ps(dst + i, s0);
1976
+ }
1977
+
1978
+ return i;
1979
+ }
1980
+
1981
+ int _nz;
1982
+ vector<uchar> coeffs;
1983
+ float delta;
1984
+ };
1985
+
1986
+
1987
+ #else
1988
+
1989
+ typedef RowNoVec RowVec_8u32s;
1990
+ typedef RowNoVec RowVec_32f;
1991
+ typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
1992
+ typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
1993
+ typedef ColumnNoVec SymmColumnVec_32s8u;
1994
+ typedef ColumnNoVec SymmColumnVec_32f;
1995
+ typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
1996
+ typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
1997
+ typedef FilterNoVec FilterVec_8u;
1998
+ typedef FilterNoVec FilterVec_8u16s;
1999
+ typedef FilterNoVec FilterVec_32f;
2000
+
2001
+ #endif
2002
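Without SSE support at build time, the typedefs above map every vector op onto the *NoVec stubs (presumably defined earlier in this file, alongside the SSE implementations). Those stubs simply report that zero pixels were handled, so the scalar loops in the filter templates below process the whole row. The shape of such a stub is roughly:

    // Shape of a no-op vector operation (illustrative sketch, not the library's definition).
    // Returning 0 tells the caller's scalar loop to start at i = 0.
    struct RowNoVecSketch
    {
        int operator()(const unsigned char*, unsigned char*, int, int) const { return 0; }
    };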
+
2003
+
2004
+ template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
2005
+ {
2006
+ RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
2007
+ {
2008
+ if( _kernel.isContinuous() )
2009
+ kernel = _kernel;
2010
+ else
2011
+ _kernel.copyTo(kernel);
2012
+ anchor = _anchor;
2013
+ ksize = kernel.rows + kernel.cols - 1;
2014
+ CV_Assert( kernel.type() == DataType<DT>::type &&
2015
+ (kernel.rows == 1 || kernel.cols == 1));
2016
+ vecOp = _vecOp;
2017
+ }
2018
+
2019
+ void operator()(const uchar* src, uchar* dst, int width, int cn)
2020
+ {
2021
+ int _ksize = ksize;
2022
+ const DT* kx = (const DT*)kernel.data;
2023
+ const ST* S;
2024
+ DT* D = (DT*)dst;
2025
+ int i, k;
2026
+
2027
+ i = vecOp(src, dst, width, cn);
2028
+ width *= cn;
2029
+
2030
+ for( ; i <= width - 4; i += 4 )
2031
+ {
2032
+ S = (const ST*)src + i;
2033
+ DT f = kx[0];
2034
+ DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];
2035
+
2036
+ for( k = 1; k < _ksize; k++ )
2037
+ {
2038
+ S += cn;
2039
+ f = kx[k];
2040
+ s0 += f*S[0]; s1 += f*S[1];
2041
+ s2 += f*S[2]; s3 += f*S[3];
2042
+ }
2043
+
2044
+ D[i] = s0; D[i+1] = s1;
2045
+ D[i+2] = s2; D[i+3] = s3;
2046
+ }
2047
+
2048
+ for( ; i < width; i++ )
2049
+ {
2050
+ S = (const ST*)src + i;
2051
+ DT s0 = kx[0]*S[0];
2052
+ for( k = 1; k < _ksize; k++ )
2053
+ {
2054
+ S += cn;
2055
+ s0 += kx[k]*S[0];
2056
+ }
2057
+ D[i] = s0;
2058
+ }
2059
+ }
2060
+
2061
+ Mat kernel;
2062
+ VecOp vecOp;
2063
+ };
2064
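RowFilter is the generic horizontal pass: it first lets vecOp consume as many leading pixels as the SIMD path can, then finishes with a 4-way unrolled loop and a scalar tail computing dst[i] = sum over k of kx[k] * src[i + k*cn]. A compact scalar restatement (sketch with ST = DT = float):

    // Scalar restatement of RowFilter::operator() (illustrative sketch).
    static void rowFilterScalar(const float* src, float* dst, int width, int cn,
                                const float* kx, int ksize)
    {
        width *= cn;                        // interleaved channels are filtered independently
        for( int i = 0; i < width; i++ )
        {
            const float* S = src + i;
            float s = 0.f;
            for( int k = 0; k < ksize; k++, S += cn )
                s += kx[k] * S[0];          // tap k reads the sample k*cn elements to the right
            dst[i] = s;
        }
    }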
+
2065
+
2066
+ template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
2067
+ public RowFilter<ST, DT, VecOp>
2068
+ {
2069
+ SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
2070
+ const VecOp& _vecOp = VecOp())
2071
+ : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
2072
+ {
2073
+ symmetryType = _symmetryType;
2074
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
2075
+ }
2076
+
2077
+ void operator()(const uchar* src, uchar* dst, int width, int cn)
2078
+ {
2079
+ int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
2080
+ const DT* kx = (const DT*)this->kernel.data + ksize2;
2081
+ bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2082
+ DT* D = (DT*)dst;
2083
+ int i = this->vecOp(src, dst, width, cn), j, k;
2084
+ const ST* S = (const ST*)src + i + ksize2n;
2085
+ width *= cn;
2086
+
2087
+ if( symmetrical )
2088
+ {
2089
+ if( this->ksize == 1 && kx[0] == 1 )
2090
+ {
2091
+ for( ; i <= width - 2; i += 2 )
2092
+ {
2093
+ DT s0 = S[i], s1 = S[i+1];
2094
+ D[i] = s0; D[i+1] = s1;
2095
+ }
2096
+ S += i;
2097
+ }
2098
+ else if( this->ksize == 3 )
2099
+ {
2100
+ if( kx[0] == 2 && kx[1] == 1 )
2101
+ for( ; i <= width - 2; i += 2, S += 2 )
2102
+ {
2103
+ DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
2104
+ D[i] = s0; D[i+1] = s1;
2105
+ }
2106
+ else if( kx[0] == -2 && kx[1] == 1 )
2107
+ for( ; i <= width - 2; i += 2, S += 2 )
2108
+ {
2109
+ DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
2110
+ D[i] = s0; D[i+1] = s1;
2111
+ }
2112
+ else
2113
+ {
2114
+ DT k0 = kx[0], k1 = kx[1];
2115
+ for( ; i <= width - 2; i += 2, S += 2 )
2116
+ {
2117
+ DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
2118
+ D[i] = s0; D[i+1] = s1;
2119
+ }
2120
+ }
2121
+ }
2122
+ else if( this->ksize == 5 )
2123
+ {
2124
+ DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
2125
+ if( k0 == -2 && k1 == 0 && k2 == 1 )
2126
+ for( ; i <= width - 2; i += 2, S += 2 )
2127
+ {
2128
+ DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
2129
+ DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
2130
+ D[i] = s0; D[i+1] = s1;
2131
+ }
2132
+ else
2133
+ for( ; i <= width - 2; i += 2, S += 2 )
2134
+ {
2135
+ DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
2136
+ DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
2137
+ D[i] = s0; D[i+1] = s1;
2138
+ }
2139
+ }
2140
+
2141
+ for( ; i < width; i++, S++ )
2142
+ {
2143
+ DT s0 = kx[0]*S[0];
2144
+ for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2145
+ s0 += kx[k]*(S[j] + S[-j]);
2146
+ D[i] = s0;
2147
+ }
2148
+ }
2149
+ else
2150
+ {
2151
+ if( this->ksize == 3 )
2152
+ {
2153
+ if( kx[0] == 0 && kx[1] == 1 )
2154
+ for( ; i <= width - 2; i += 2, S += 2 )
2155
+ {
2156
+ DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
2157
+ D[i] = s0; D[i+1] = s1;
2158
+ }
2159
+ else
2160
+ {
2161
+ DT k1 = kx[1];
2162
+ for( ; i <= width - 2; i += 2, S += 2 )
2163
+ {
2164
+ DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
2165
+ D[i] = s0; D[i+1] = s1;
2166
+ }
2167
+ }
2168
+ }
2169
+ else if( this->ksize == 5 )
2170
+ {
2171
+ DT k1 = kx[1], k2 = kx[2];
2172
+ for( ; i <= width - 2; i += 2, S += 2 )
2173
+ {
2174
+ DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
2175
+ DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
2176
+ D[i] = s0; D[i+1] = s1;
2177
+ }
2178
+ }
2179
+
2180
+ for( ; i < width; i++, S++ )
2181
+ {
2182
+ DT s0 = kx[0]*S[0];
2183
+ for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2184
+ s0 += kx[k]*(S[j] - S[-j]);
2185
+ D[i] = s0;
2186
+ }
2187
+ }
2188
+ }
2189
+
2190
+ int symmetryType;
2191
+ };
2192
+
2193
+
2194
+ template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
2195
+ {
2196
+ typedef typename CastOp::type1 ST;
2197
+ typedef typename CastOp::rtype DT;
2198
+
2199
+ ColumnFilter( const Mat& _kernel, int _anchor,
2200
+ double _delta, const CastOp& _castOp=CastOp(),
2201
+ const VecOp& _vecOp=VecOp() )
2202
+ {
2203
+ if( _kernel.isContinuous() )
2204
+ kernel = _kernel;
2205
+ else
2206
+ _kernel.copyTo(kernel);
2207
+ anchor = _anchor;
2208
+ ksize = kernel.rows + kernel.cols - 1;
2209
+ delta = saturate_cast<ST>(_delta);
2210
+ castOp0 = _castOp;
2211
+ vecOp = _vecOp;
2212
+ CV_Assert( kernel.type() == DataType<ST>::type &&
2213
+ (kernel.rows == 1 || kernel.cols == 1));
2214
+ }
2215
+
2216
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2217
+ {
2218
+ const ST* ky = (const ST*)kernel.data;
2219
+ ST _delta = delta;
2220
+ int _ksize = ksize;
2221
+ int i, k;
2222
+ CastOp castOp = castOp0;
2223
+
2224
+ for( ; count--; dst += dststep, src++ )
2225
+ {
2226
+ DT* D = (DT*)dst;
2227
+ i = vecOp(src, dst, width);
2228
+ for( ; i <= width - 4; i += 4 )
2229
+ {
2230
+ ST f = ky[0];
2231
+ const ST* S = (const ST*)src[0] + i;
2232
+ ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2233
+ s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2234
+
2235
+ for( k = 1; k < _ksize; k++ )
2236
+ {
2237
+ S = (const ST*)src[k] + i; f = ky[k];
2238
+ s0 += f*S[0]; s1 += f*S[1];
2239
+ s2 += f*S[2]; s3 += f*S[3];
2240
+ }
2241
+
2242
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2243
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2244
+ }
2245
+
2246
+ for( ; i < width; i++ )
2247
+ {
2248
+ ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2249
+ for( k = 1; k < _ksize; k++ )
2250
+ s0 += ky[k]*((const ST*)src[k])[i];
2251
+ D[i] = castOp(s0);
2252
+ }
2253
+ }
2254
+ }
2255
+
2256
+ Mat kernel;
2257
+ CastOp castOp0;
2258
+ VecOp vecOp;
2259
+ ST delta;
2260
+ };
2261
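ColumnFilter is the matching vertical pass. For each output row it walks ksize buffered row pointers, adds delta, and pushes the sum through castOp (a plain saturating cast, or a fixed-point shift on the integer path). One output row in scalar form (sketch with a float buffer and an 8-bit destination):

    // One output row of ColumnFilter in scalar form (illustrative sketch).
    // rows[0..ksize-1] point at the buffered source rows contributing to this output row.
    static void columnFilterRow(const float** rows, unsigned char* dst, int width,
                                const float* ky, int ksize, float delta)
    {
        for( int i = 0; i < width; i++ )
        {
            float s = delta;
            for( int k = 0; k < ksize; k++ )
                s += ky[k] * rows[k][i];
            int v = (int)(s + 0.5f);
            dst[i] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);   // the castOp step
        }
    }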
+
2262
+
2263
+ template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
2264
+ {
2265
+ typedef typename CastOp::type1 ST;
2266
+ typedef typename CastOp::rtype DT;
2267
+
2268
+ SymmColumnFilter( const Mat& _kernel, int _anchor,
2269
+ double _delta, int _symmetryType,
2270
+ const CastOp& _castOp=CastOp(),
2271
+ const VecOp& _vecOp=VecOp())
2272
+ : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
2273
+ {
2274
+ symmetryType = _symmetryType;
2275
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
2276
+ }
2277
+
2278
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2279
+ {
2280
+ int ksize2 = this->ksize/2;
2281
+ const ST* ky = (const ST*)this->kernel.data + ksize2;
2282
+ int i, k;
2283
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
2284
+ ST _delta = this->delta;
2285
+ CastOp castOp = this->castOp0;
2286
+ src += ksize2;
2287
+
2288
+ if( symmetrical )
2289
+ {
2290
+ for( ; count--; dst += dststep, src++ )
2291
+ {
2292
+ DT* D = (DT*)dst;
2293
+ i = (this->vecOp)(src, dst, width);
2294
+
2295
+ for( ; i <= width - 4; i += 4 )
2296
+ {
2297
+ ST f = ky[0];
2298
+ const ST* S = (const ST*)src[0] + i, *S2;
2299
+ ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2300
+ s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2301
+
2302
+ for( k = 1; k <= ksize2; k++ )
2303
+ {
2304
+ S = (const ST*)src[k] + i;
2305
+ S2 = (const ST*)src[-k] + i;
2306
+ f = ky[k];
2307
+ s0 += f*(S[0] + S2[0]);
2308
+ s1 += f*(S[1] + S2[1]);
2309
+ s2 += f*(S[2] + S2[2]);
2310
+ s3 += f*(S[3] + S2[3]);
2311
+ }
2312
+
2313
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2314
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2315
+ }
2316
+
2317
+ for( ; i < width; i++ )
2318
+ {
2319
+ ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2320
+ for( k = 1; k <= ksize2; k++ )
2321
+ s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
2322
+ D[i] = castOp(s0);
2323
+ }
2324
+ }
2325
+ }
2326
+ else
2327
+ {
2328
+ for( ; count--; dst += dststep, src++ )
2329
+ {
2330
+ DT* D = (DT*)dst;
2331
+ i = this->vecOp(src, dst, width);
2332
+
2333
+ for( ; i <= width - 4; i += 4 )
2334
+ {
2335
+ ST f = ky[0];
2336
+ const ST *S, *S2;
2337
+ ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2338
+
2339
+ for( k = 1; k <= ksize2; k++ )
2340
+ {
2341
+ S = (const ST*)src[k] + i;
2342
+ S2 = (const ST*)src[-k] + i;
2343
+ f = ky[k];
2344
+ s0 += f*(S[0] - S2[0]);
2345
+ s1 += f*(S[1] - S2[1]);
2346
+ s2 += f*(S[2] - S2[2]);
2347
+ s3 += f*(S[3] - S2[3]);
2348
+ }
2349
+
2350
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2351
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2352
+ }
2353
+
2354
+ for( ; i < width; i++ )
2355
+ {
2356
+ ST s0 = _delta;
2357
+ for( k = 1; k <= ksize2; k++ )
2358
+ s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
2359
+ D[i] = castOp(s0);
2360
+ }
2361
+ }
2362
+ }
2363
+ }
2364
+
2365
+ int symmetryType;
2366
+ };
2367
+
2368
+
2369
+ template<class CastOp, class VecOp>
2370
+ struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
2371
+ {
2372
+ typedef typename CastOp::type1 ST;
2373
+ typedef typename CastOp::rtype DT;
2374
+
2375
+ SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
2376
+ double _delta, int _symmetryType,
2377
+ const CastOp& _castOp=CastOp(),
2378
+ const VecOp& _vecOp=VecOp())
2379
+ : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
2380
+ {
2381
+ CV_Assert( this->ksize == 3 );
2382
+ }
2383
+
2384
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2385
+ {
2386
+ int ksize2 = this->ksize/2;
2387
+ const ST* ky = (const ST*)this->kernel.data + ksize2;
2388
+ int i;
2389
+ bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2390
+ bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
2391
+ bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
2392
+ bool is_m1_0_1 = ky[1] == 1 || ky[1] == -1;
2393
+ ST f0 = ky[0], f1 = ky[1];
2394
+ ST _delta = this->delta;
2395
+ CastOp castOp = this->castOp0;
2396
+ src += ksize2;
2397
+
2398
+ for( ; count--; dst += dststep, src++ )
2399
+ {
2400
+ DT* D = (DT*)dst;
2401
+ i = (this->vecOp)(src, dst, width);
2402
+ const ST* S0 = (const ST*)src[-1];
2403
+ const ST* S1 = (const ST*)src[0];
2404
+ const ST* S2 = (const ST*)src[1];
2405
+
2406
+ if( symmetrical )
2407
+ {
2408
+ if( is_1_2_1 )
2409
+ {
2410
+ for( ; i <= width - 4; i += 4 )
2411
+ {
2412
+ ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
2413
+ ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
2414
+ D[i] = castOp(s0);
2415
+ D[i+1] = castOp(s1);
2416
+
2417
+ s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
2418
+ s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
2419
+ D[i+2] = castOp(s0);
2420
+ D[i+3] = castOp(s1);
2421
+ }
2422
+ }
2423
+ else if( is_1_m2_1 )
2424
+ {
2425
+ for( ; i <= width - 4; i += 4 )
2426
+ {
2427
+ ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
2428
+ ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
2429
+ D[i] = castOp(s0);
2430
+ D[i+1] = castOp(s1);
2431
+
2432
+ s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
2433
+ s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
2434
+ D[i+2] = castOp(s0);
2435
+ D[i+3] = castOp(s1);
2436
+ }
2437
+ }
2438
+ else
2439
+ {
2440
+ for( ; i <= width - 4; i += 4 )
2441
+ {
2442
+ ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
2443
+ ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
2444
+ D[i] = castOp(s0);
2445
+ D[i+1] = castOp(s1);
2446
+
2447
+ s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
2448
+ s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
2449
+ D[i+2] = castOp(s0);
2450
+ D[i+3] = castOp(s1);
2451
+ }
2452
+ }
2453
+
2454
+ for( ; i < width; i++ )
2455
+ D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
2456
+ }
2457
+ else
2458
+ {
2459
+ if( is_m1_0_1 )
2460
+ {
2461
+ if( f1 < 0 )
2462
+ std::swap(S0, S2);
2463
+
2464
+ for( ; i <= width - 4; i += 4 )
2465
+ {
2466
+ ST s0 = S2[i] - S0[i] + _delta;
2467
+ ST s1 = S2[i+1] - S0[i+1] + _delta;
2468
+ D[i] = castOp(s0);
2469
+ D[i+1] = castOp(s1);
2470
+
2471
+ s0 = S2[i+2] - S0[i+2] + _delta;
2472
+ s1 = S2[i+3] - S0[i+3] + _delta;
2473
+ D[i+2] = castOp(s0);
2474
+ D[i+3] = castOp(s1);
2475
+ }
2476
+
2477
+ if( f1 < 0 )
2478
+ std::swap(S0, S2);
2479
+ }
2480
+ else
2481
+ {
2482
+ for( ; i <= width - 4; i += 4 )
2483
+ {
2484
+ ST s0 = (S2[i] - S0[i])*f1 + _delta;
2485
+ ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
2486
+ D[i] = castOp(s0);
2487
+ D[i+1] = castOp(s1);
2488
+
2489
+ s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
2490
+ s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
2491
+ D[i+2] = castOp(s0);
2492
+ D[i+3] = castOp(s1);
2493
+ }
2494
+ }
2495
+
2496
+ for( ; i < width; i++ )
2497
+ D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
2498
+ }
2499
+ }
2500
+ }
2501
+ };
2502
+
2503
+ template<typename ST, typename DT> struct Cast
2504
+ {
2505
+ typedef ST type1;
2506
+ typedef DT rtype;
2507
+
2508
+ DT operator()(ST val) const { return saturate_cast<DT>(val); }
2509
+ };
2510
+
2511
+ template<typename ST, typename DT, int bits> struct FixedPtCast
2512
+ {
2513
+ typedef ST type1;
2514
+ typedef DT rtype;
2515
+ enum { SHIFT = bits, DELTA = 1 << (bits-1) };
2516
+
2517
+ DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
2518
+ };
2519
+
2520
+ template<typename ST, typename DT> struct FixedPtCastEx
2521
+ {
2522
+ typedef ST type1;
2523
+ typedef DT rtype;
2524
+
2525
+ FixedPtCastEx() : SHIFT(0), DELTA(0) {}
2526
+ FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
2527
+ DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
2528
+ int SHIFT, DELTA;
2529
+ };
2530
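The three cast functors define how accumulated sums become destination pixels: Cast is a plain saturate_cast, while FixedPtCast/FixedPtCastEx also undo the fixed-point scaling used on the 8-bit separable path (each 1D kernel was scaled by 1 << 8, so the column cast is built with bits = 16). The cast adds half an output LSB and shifts back down, which rounds to nearest. A small worked example, assuming bits = 16:

    // Worked example of the FixedPtCastEx arithmetic (sketch; saturation omitted).
    #include <cstdio>
    int main()
    {
        const int bits  = 16;
        const int DELTA = 1 << (bits - 1);   // 32768 == half an output LSB
        int acc = 688128;                    // accumulated fixed-point sum == 10.5 * 65536
        int out = (acc + DELTA) >> bits;     // (688128 + 32768) >> 16 == 11
        std::printf("%d\n", out);            // prints 11: 10.5 rounds up to 11
        return 0;
    }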
+
2531
+ }
2532
+
2533
+ cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
2534
+ InputArray _kernel, int anchor,
2535
+ int symmetryType )
2536
+ {
2537
+ Mat kernel = _kernel.getMat();
2538
+ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
2539
+ int cn = CV_MAT_CN(srcType);
2540
+ CV_Assert( cn == CV_MAT_CN(bufType) &&
2541
+ ddepth >= std::max(sdepth, CV_32S) &&
2542
+ kernel.type() == ddepth );
2543
+ int ksize = kernel.rows + kernel.cols - 1;
2544
+
2545
+ if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
2546
+ {
2547
+ if( sdepth == CV_8U && ddepth == CV_32S )
2548
+ return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
2549
+ (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
2550
+ if( sdepth == CV_32F && ddepth == CV_32F )
2551
+ return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
2552
+ (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
2553
+ }
2554
+
2555
+ if( sdepth == CV_8U && ddepth == CV_32S )
2556
+ return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
2557
+ (kernel, anchor, RowVec_8u32s(kernel)));
2558
+ if( sdepth == CV_8U && ddepth == CV_32F )
2559
+ return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
2560
+ if( sdepth == CV_8U && ddepth == CV_64F )
2561
+ return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
2562
+ if( sdepth == CV_16U && ddepth == CV_32F )
2563
+ return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
2564
+ if( sdepth == CV_16U && ddepth == CV_64F )
2565
+ return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
2566
+ if( sdepth == CV_16S && ddepth == CV_32F )
2567
+ return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
2568
+ if( sdepth == CV_16S && ddepth == CV_64F )
2569
+ return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
2570
+ if( sdepth == CV_32F && ddepth == CV_32F )
2571
+ return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
2572
+ (kernel, anchor, RowVec_32f(kernel)));
2573
+ if( sdepth == CV_64F && ddepth == CV_64F )
2574
+ return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));
2575
+
2576
+ CV_Error_( CV_StsNotImplemented,
2577
+ ("Unsupported combination of source format (=%d), and buffer format (=%d)",
2578
+ srcType, bufType));
2579
+
2580
+ return Ptr<BaseRowFilter>(0);
2581
+ }
2582
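getLinearRowFilter is the factory that maps a (source depth, buffer depth) pair onto one of the row-filter templates above, wiring in the matching SSE row op where one exists and falling back to RowNoVec otherwise. A hedged usage sketch (public OpenCV calls as declared in the bundled headers):

    // Hedged usage sketch: build a CV_32F row filter for a 1x3 derivative kernel.
    #include <opencv2/imgproc/imgproc.hpp>
    void rowFilterExample()
    {
        cv::Mat kx = (cv::Mat_<float>(1,3) << -1.f, 0.f, 1.f);
        int ktype = cv::getKernelType(kx, cv::Point(1,0));               // symmetry flags
        cv::Ptr<cv::BaseRowFilter> rf =
            cv::getLinearRowFilter(CV_32FC1, CV_32FC1, kx, 1, ktype);    // anchor = 1 (centre)
    }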
+
2583
+
2584
+ cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
2585
+ InputArray _kernel, int anchor,
2586
+ int symmetryType, double delta,
2587
+ int bits )
2588
+ {
2589
+ Mat kernel = _kernel.getMat();
2590
+ int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
2591
+ int cn = CV_MAT_CN(dstType);
2592
+ CV_Assert( cn == CV_MAT_CN(bufType) &&
2593
+ sdepth >= std::max(ddepth, CV_32S) &&
2594
+ kernel.type() == sdepth );
2595
+
2596
+ if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
2597
+ {
2598
+ if( ddepth == CV_8U && sdepth == CV_32S )
2599
+ return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
2600
+ (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
2601
+ if( ddepth == CV_8U && sdepth == CV_32F )
2602
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
2603
+ if( ddepth == CV_8U && sdepth == CV_64F )
2604
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
2605
+ if( ddepth == CV_16U && sdepth == CV_32F )
2606
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
2607
+ if( ddepth == CV_16U && sdepth == CV_64F )
2608
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
2609
+ if( ddepth == CV_16S && sdepth == CV_32F )
2610
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
2611
+ if( ddepth == CV_16S && sdepth == CV_64F )
2612
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
2613
+ if( ddepth == CV_32F && sdepth == CV_32F )
2614
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
2615
+ if( ddepth == CV_64F && sdepth == CV_64F )
2616
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
2617
+ }
2618
+ else
2619
+ {
2620
+ int ksize = kernel.rows + kernel.cols - 1;
2621
+ if( ksize == 3 )
2622
+ {
2623
+ if( ddepth == CV_8U && sdepth == CV_32S )
2624
+ return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
2625
+ FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
2626
+ (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
2627
+ SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
2628
+ if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
2629
+ return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
2630
+ SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
2631
+ Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
2632
+ if( ddepth == CV_32F && sdepth == CV_32F )
2633
+ return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
2634
+ Cast<float, float>,SymmColumnSmallVec_32f>
2635
+ (kernel, anchor, delta, symmetryType, Cast<float, float>(),
2636
+ SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
2637
+ }
2638
+ if( ddepth == CV_8U && sdepth == CV_32S )
2639
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
2640
+ (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
2641
+ SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
2642
+ if( ddepth == CV_8U && sdepth == CV_32F )
2643
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
2644
+ (kernel, anchor, delta, symmetryType));
2645
+ if( ddepth == CV_8U && sdepth == CV_64F )
2646
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
2647
+ (kernel, anchor, delta, symmetryType));
2648
+ if( ddepth == CV_16U && sdepth == CV_32F )
2649
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
2650
+ (kernel, anchor, delta, symmetryType));
2651
+ if( ddepth == CV_16U && sdepth == CV_64F )
2652
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
2653
+ (kernel, anchor, delta, symmetryType));
2654
+ if( ddepth == CV_16S && sdepth == CV_32S )
2655
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
2656
+ (kernel, anchor, delta, symmetryType));
2657
+ if( ddepth == CV_16S && sdepth == CV_32F )
2658
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
2659
+ (kernel, anchor, delta, symmetryType));
2660
+ if( ddepth == CV_16S && sdepth == CV_64F )
2661
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
2662
+ (kernel, anchor, delta, symmetryType));
2663
+ if( ddepth == CV_32F && sdepth == CV_32F )
2664
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
2665
+ (kernel, anchor, delta, symmetryType, Cast<float, float>(),
2666
+ SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
2667
+ if( ddepth == CV_64F && sdepth == CV_64F )
2668
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
2669
+ (kernel, anchor, delta, symmetryType));
2670
+ }
2671
+
2672
+ CV_Error_( CV_StsNotImplemented,
2673
+ ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
2674
+ bufType, dstType));
2675
+
2676
+ return Ptr<BaseColumnFilter>(0);
2677
+ }
2678
+
2679
+
2680
+ cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
2681
+ int _srcType, int _dstType,
2682
+ InputArray __rowKernel, InputArray __columnKernel,
2683
+ Point _anchor, double _delta,
2684
+ int _rowBorderType, int _columnBorderType,
2685
+ const Scalar& _borderValue )
2686
+ {
2687
+ Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
2688
+ _srcType = CV_MAT_TYPE(_srcType);
2689
+ _dstType = CV_MAT_TYPE(_dstType);
2690
+ int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2691
+ int cn = CV_MAT_CN(_srcType);
2692
+ CV_Assert( cn == CV_MAT_CN(_dstType) );
2693
+ int rsize = _rowKernel.rows + _rowKernel.cols - 1;
2694
+ int csize = _columnKernel.rows + _columnKernel.cols - 1;
2695
+ if( _anchor.x < 0 )
2696
+ _anchor.x = rsize/2;
2697
+ if( _anchor.y < 0 )
2698
+ _anchor.y = csize/2;
2699
+ int rtype = getKernelType(_rowKernel,
2700
+ _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
2701
+ int ctype = getKernelType(_columnKernel,
2702
+ _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
2703
+ Mat rowKernel, columnKernel;
2704
+
2705
+ int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
2706
+ int bits = 0;
2707
+
2708
+ if( sdepth == CV_8U &&
2709
+ ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2710
+ ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2711
+ ddepth == CV_8U) ||
2712
+ ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2713
+ (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2714
+ (rtype & ctype & KERNEL_INTEGER) &&
2715
+ ddepth == CV_16S)) )
2716
+ {
2717
+ bdepth = CV_32S;
2718
+ bits = ddepth == CV_8U ? 8 : 0;
2719
+ _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
2720
+ _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
2721
+ bits *= 2;
2722
+ _delta *= (1 << bits);
2723
+ }
2724
+ else
2725
+ {
2726
+ if( _rowKernel.type() != bdepth )
2727
+ _rowKernel.convertTo( rowKernel, bdepth );
2728
+ else
2729
+ rowKernel = _rowKernel;
2730
+ if( _columnKernel.type() != bdepth )
2731
+ _columnKernel.convertTo( columnKernel, bdepth );
2732
+ else
2733
+ columnKernel = _columnKernel;
2734
+ }
2735
+
2736
+ int _bufType = CV_MAKETYPE(bdepth, cn);
2737
+ Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
2738
+ _srcType, _bufType, rowKernel, _anchor.x, rtype);
2739
+ Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
2740
+ _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
2741
+
2742
+ return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
2743
+ _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
2744
+ }
2745
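createSeparableLinearFilter chooses the intermediate buffer type. For 8-bit sources with a smooth symmetric kernel pair going back to 8U (or integer symmetric/antisymmetric kernels going to 16S) it uses a CV_32S buffer: for the 8U destination each 1D kernel is scaled by 1 << 8, bits is then doubled to 16 so the final column cast shifts the combined factor back out, and delta is pre-scaled the same way; for the 16S integer case bits stays 0. Everything else runs through a float (or double) buffer. A hedged usage sketch of the whole separable pipeline:

    // Hedged usage sketch: 5x5 Gaussian smoothing through the engine built here.
    #include <opencv2/imgproc/imgproc.hpp>
    void separableExample(const cv::Mat& src8u, cv::Mat& dst8u)
    {
        cv::Mat k = cv::getGaussianKernel(5, -1, CV_32F);        // 5-tap smoothing kernel
        cv::Ptr<cv::FilterEngine> f =
            cv::createSeparableLinearFilter(CV_8UC1, CV_8UC1, k, k);
        dst8u.create(src8u.size(), CV_8UC1);
        f->apply(src8u, dst8u);                                  // row pass, then column pass
    }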
+
2746
+
2747
+ /****************************************************************************************\
2748
+ * Non-separable linear filter *
2749
+ \****************************************************************************************/
2750
+
2751
+ namespace cv
2752
+ {
2753
+
2754
+ void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
2755
+ {
2756
+ int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
2757
+ if(nz == 0)
2758
+ nz = 1;
2759
+ CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
2760
+ coords.resize(nz);
2761
+ coeffs.resize(nz*getElemSize(ktype));
2762
+ uchar* _coeffs = &coeffs[0];
2763
+
2764
+ for( i = k = 0; i < kernel.rows; i++ )
2765
+ {
2766
+ const uchar* krow = kernel.data + kernel.step*i;
2767
+ for( j = 0; j < kernel.cols; j++ )
2768
+ {
2769
+ if( ktype == CV_8U )
2770
+ {
2771
+ uchar val = krow[j];
2772
+ if( val == 0 )
2773
+ continue;
2774
+ coords[k] = Point(j,i);
2775
+ _coeffs[k++] = val;
2776
+ }
2777
+ else if( ktype == CV_32S )
2778
+ {
2779
+ int val = ((const int*)krow)[j];
2780
+ if( val == 0 )
2781
+ continue;
2782
+ coords[k] = Point(j,i);
2783
+ ((int*)_coeffs)[k++] = val;
2784
+ }
2785
+ else if( ktype == CV_32F )
2786
+ {
2787
+ float val = ((const float*)krow)[j];
2788
+ if( val == 0 )
2789
+ continue;
2790
+ coords[k] = Point(j,i);
2791
+ ((float*)_coeffs)[k++] = val;
2792
+ }
2793
+ else
2794
+ {
2795
+ double val = ((const double*)krow)[j];
2796
+ if( val == 0 )
2797
+ continue;
2798
+ coords[k] = Point(j,i);
2799
+ ((double*)_coeffs)[k++] = val;
2800
+ }
2801
+ }
2802
+ }
2803
+ }
2804
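preprocess2DKernel flattens a 2D kernel into sparse form: coords receives the (x, y) position of every non-zero tap and coeffs the raw coefficient bytes, so Filter2D below only ever reads source pixels that actually contribute. For example, the 3x3 cross-shaped Laplacian kernel

    0  1  0
    1 -4  1
    0  1  0

reduces to five coordinate/coefficient pairs; the zero corners are never touched. A tiny check of that count (hedged sketch using only the public API, since preprocess2DKernel itself is internal to imgproc):

    #include <opencv2/imgproc/imgproc.hpp>
    #include <cstdio>
    void sparseLaplacianExample()
    {
        float data[9] = { 0, 1, 0,  1, -4, 1,  0, 1, 0 };
        cv::Mat k(3, 3, CV_32F, data);
        std::printf("non-zero taps: %d\n", cv::countNonZero(k));   // prints 5
    }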
+
2805
+
2806
+ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
2807
+ {
2808
+ typedef typename CastOp::type1 KT;
2809
+ typedef typename CastOp::rtype DT;
2810
+
2811
+ Filter2D( const Mat& _kernel, Point _anchor,
2812
+ double _delta, const CastOp& _castOp=CastOp(),
2813
+ const VecOp& _vecOp=VecOp() )
2814
+ {
2815
+ anchor = _anchor;
2816
+ ksize = _kernel.size();
2817
+ delta = saturate_cast<KT>(_delta);
2818
+ castOp0 = _castOp;
2819
+ vecOp = _vecOp;
2820
+ CV_Assert( _kernel.type() == DataType<KT>::type );
2821
+ preprocess2DKernel( _kernel, coords, coeffs );
2822
+ ptrs.resize( coords.size() );
2823
+ }
2824
+
2825
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
2826
+ {
2827
+ KT _delta = delta;
2828
+ const Point* pt = &coords[0];
2829
+ const KT* kf = (const KT*)&coeffs[0];
2830
+ const ST** kp = (const ST**)&ptrs[0];
2831
+ int i, k, nz = (int)coords.size();
2832
+ CastOp castOp = castOp0;
2833
+
2834
+ width *= cn;
2835
+ for( ; count > 0; count--, dst += dststep, src++ )
2836
+ {
2837
+ DT* D = (DT*)dst;
2838
+
2839
+ for( k = 0; k < nz; k++ )
2840
+ kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;
2841
+
2842
+ i = vecOp((const uchar**)kp, dst, width);
2843
+
2844
+ for( ; i <= width - 4; i += 4 )
2845
+ {
2846
+ KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2847
+
2848
+ for( k = 0; k < nz; k++ )
2849
+ {
2850
+ const ST* sptr = kp[k] + i;
2851
+ KT f = kf[k];
2852
+ s0 += f*sptr[0];
2853
+ s1 += f*sptr[1];
2854
+ s2 += f*sptr[2];
2855
+ s3 += f*sptr[3];
2856
+ }
2857
+
2858
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2859
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2860
+ }
2861
+
2862
+ for( ; i < width; i++ )
2863
+ {
2864
+ KT s0 = _delta;
2865
+ for( k = 0; k < nz; k++ )
2866
+ s0 += kf[k]*kp[k][i];
2867
+ D[i] = castOp(s0);
2868
+ }
2869
+ }
2870
+ }
2871
+
2872
+ vector<Point> coords;
2873
+ vector<uchar> coeffs;
2874
+ vector<uchar*> ptrs;
2875
+ KT delta;
2876
+ CastOp castOp0;
2877
+ VecOp vecOp;
2878
+ };
2879
+
2880
+ }
2881
+
2882
+ cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
2883
+ InputArray filter_kernel, Point anchor,
2884
+ double delta, int bits)
2885
+ {
2886
+ Mat _kernel = filter_kernel.getMat();
2887
+ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
2888
+ int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
2889
+ CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );
2890
+
2891
+ anchor = normalizeAnchor(anchor, _kernel.size());
2892
+
2893
+ /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
2894
+ return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
2895
+ (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
2896
+ FilterVec_8u(_kernel, bits, delta)));
2897
+ if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
2898
+ return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
2899
+ (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
2900
+ FilterVec_8u16s(_kernel, bits, delta)));*/
2901
+
2902
+ kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
2903
+ Mat kernel;
2904
+ if( _kernel.type() == kdepth )
2905
+ kernel = _kernel;
2906
+ else
2907
+ _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);
2908
+
2909
+ if( sdepth == CV_8U && ddepth == CV_8U )
2910
+ return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
2911
+ (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
2912
+ if( sdepth == CV_8U && ddepth == CV_16U )
2913
+ return Ptr<BaseFilter>(new Filter2D<uchar,
2914
+ Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
2915
+ if( sdepth == CV_8U && ddepth == CV_16S )
2916
+ return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
2917
+ (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
2918
+ if( sdepth == CV_8U && ddepth == CV_32F )
2919
+ return Ptr<BaseFilter>(new Filter2D<uchar,
2920
+ Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2921
+ if( sdepth == CV_8U && ddepth == CV_64F )
2922
+ return Ptr<BaseFilter>(new Filter2D<uchar,
2923
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2924
+
2925
+ if( sdepth == CV_16U && ddepth == CV_16U )
2926
+ return Ptr<BaseFilter>(new Filter2D<ushort,
2927
+ Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
2928
+ if( sdepth == CV_16U && ddepth == CV_32F )
2929
+ return Ptr<BaseFilter>(new Filter2D<ushort,
2930
+ Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2931
+ if( sdepth == CV_16U && ddepth == CV_64F )
2932
+ return Ptr<BaseFilter>(new Filter2D<ushort,
2933
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2934
+
2935
+ if( sdepth == CV_16S && ddepth == CV_16S )
2936
+ return Ptr<BaseFilter>(new Filter2D<short,
2937
+ Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
2938
+ if( sdepth == CV_16S && ddepth == CV_32F )
2939
+ return Ptr<BaseFilter>(new Filter2D<short,
2940
+ Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2941
+ if( sdepth == CV_16S && ddepth == CV_64F )
2942
+ return Ptr<BaseFilter>(new Filter2D<short,
2943
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2944
+
2945
+ if( sdepth == CV_32F && ddepth == CV_32F )
2946
+ return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
2947
+ (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
2948
+ if( sdepth == CV_64F && ddepth == CV_64F )
2949
+ return Ptr<BaseFilter>(new Filter2D<double,
2950
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2951
+
2952
+ CV_Error_( CV_StsNotImplemented,
2953
+ ("Unsupported combination of source format (=%d), and destination format (=%d)",
2954
+ srcType, dstType));
2955
+
2956
+ return Ptr<BaseFilter>(0);
2957
+ }
2958
+
2959
+
2960
+ cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
2961
+ InputArray filter_kernel,
2962
+ Point _anchor, double _delta,
2963
+ int _rowBorderType, int _columnBorderType,
2964
+ const Scalar& _borderValue )
2965
+ {
2966
+ Mat _kernel = filter_kernel.getMat();
2967
+ _srcType = CV_MAT_TYPE(_srcType);
2968
+ _dstType = CV_MAT_TYPE(_dstType);
2969
+ int cn = CV_MAT_CN(_srcType);
2970
+ CV_Assert( cn == CV_MAT_CN(_dstType) );
2971
+
2972
+ Mat kernel = _kernel;
2973
+ int bits = 0;
2974
+
2975
+ /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2976
+ int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
2977
+ if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
2978
+ _kernel.rows*_kernel.cols <= (1 << 10) )
2979
+ {
2980
+ bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
2981
+ _kernel.convertTo(kernel, CV_32S, 1 << bits);
2982
+ }*/
2983
+
2984
+ Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
2985
+ kernel, _anchor, _delta, bits);
2986
+
2987
+ return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
2988
+ Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
2989
+ _rowBorderType, _columnBorderType, _borderValue ));
2990
+ }
2991
+
2992
+
2993
+ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
2994
+ InputArray _kernel, Point anchor,
2995
+ double delta, int borderType )
2996
+ {
2997
+ Mat src = _src.getMat(), kernel = _kernel.getMat();
2998
+
2999
+ if( ddepth < 0 )
3000
+ ddepth = src.depth();
3001
+
3002
+ #if CV_SSE2
3003
+ int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
3004
+ (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50;
3005
+ #else
3006
+ int dft_filter_size = 50;
3007
+ #endif
3008
+
3009
+ _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
3010
+ Mat dst = _dst.getMat();
3011
+ anchor = normalizeAnchor(anchor, kernel.size());
3012
+
3013
+ if( kernel.cols*kernel.rows >= dft_filter_size )
3014
+ {
3015
+ Mat temp;
3016
+ if( src.data != dst.data )
3017
+ temp = dst;
3018
+ else
3019
+ temp.create(dst.size(), dst.type());
3020
+ crossCorr( src, kernel, temp, src.size(),
3021
+ CV_MAKETYPE(ddepth, src.channels()),
3022
+ anchor, delta, borderType );
3023
+ if( temp.data != dst.data )
3024
+ temp.copyTo(dst);
3025
+ return;
3026
+ }
3027
+
3028
+ Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
3029
+ anchor, delta, borderType );
3030
+ f->apply(src, dst);
3031
+ }
3032
+
3033
+
3034
+ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
3035
+ InputArray _kernelX, InputArray _kernelY, Point anchor,
3036
+ double delta, int borderType )
3037
+ {
3038
+ Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();
3039
+
3040
+ if( ddepth < 0 )
3041
+ ddepth = src.depth();
3042
+
3043
+ _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
3044
+ Mat dst = _dst.getMat();
3045
+
3046
+ Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
3047
+ dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
3048
+ f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
3049
+ }
3050
+
3051
+
3052
+ CV_IMPL void
3053
+ cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
3054
+ {
3055
+ cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3056
+ cv::Mat kernel = cv::cvarrToMat(_kernel);
3057
+
3058
+ CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );
3059
+
3060
+ cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
3061
+ }
3062
+
3063
+ /* End of file. */