imagecore 0.0.1

Files changed (71)
  1. data/.gitignore +24 -0
  2. data/Gemfile +4 -0
  3. data/Rakefile +2 -0
  4. data/ext/imagecore/analyze_image.cxx +58 -0
  5. data/ext/imagecore/analyze_image.h +6 -0
  6. data/ext/imagecore/extconf.rb +9 -0
  7. data/ext/imagecore/imagecore.cxx +34 -0
  8. data/ext/opencv/core/___.c +3 -0
  9. data/ext/opencv/core/alloc.cpp +697 -0
  10. data/ext/opencv/core/array.cpp +3206 -0
  11. data/ext/opencv/core/datastructs.cpp +4064 -0
  12. data/ext/opencv/core/extconf.rb +22 -0
  13. data/ext/opencv/core/matrix.cpp +3777 -0
  14. data/ext/opencv/core/precomp.hpp +216 -0
  15. data/ext/opencv/core/system.cpp +832 -0
  16. data/ext/opencv/core/tables.cpp +3512 -0
  17. data/ext/opencv/highgui/___.c +3 -0
  18. data/ext/opencv/highgui/bitstrm.cpp +582 -0
  19. data/ext/opencv/highgui/bitstrm.hpp +182 -0
  20. data/ext/opencv/highgui/extconf.rb +28 -0
  21. data/ext/opencv/highgui/grfmt_base.cpp +128 -0
  22. data/ext/opencv/highgui/grfmt_base.hpp +113 -0
  23. data/ext/opencv/highgui/grfmt_bmp.cpp +564 -0
  24. data/ext/opencv/highgui/grfmt_bmp.hpp +99 -0
  25. data/ext/opencv/highgui/grfmt_exr.hpp +113 -0
  26. data/ext/opencv/highgui/grfmt_imageio.hpp +56 -0
  27. data/ext/opencv/highgui/grfmt_jpeg.cpp +622 -0
  28. data/ext/opencv/highgui/grfmt_jpeg.hpp +90 -0
  29. data/ext/opencv/highgui/grfmt_jpeg2000.cpp +529 -0
  30. data/ext/opencv/highgui/grfmt_jpeg2000.hpp +95 -0
  31. data/ext/opencv/highgui/grfmt_png.cpp +406 -0
  32. data/ext/opencv/highgui/grfmt_png.hpp +101 -0
  33. data/ext/opencv/highgui/grfmt_pxm.cpp +513 -0
  34. data/ext/opencv/highgui/grfmt_pxm.hpp +92 -0
  35. data/ext/opencv/highgui/grfmt_sunras.cpp +425 -0
  36. data/ext/opencv/highgui/grfmt_sunras.hpp +105 -0
  37. data/ext/opencv/highgui/grfmt_tiff.cpp +718 -0
  38. data/ext/opencv/highgui/grfmt_tiff.hpp +136 -0
  39. data/ext/opencv/highgui/grfmts.hpp +56 -0
  40. data/ext/opencv/highgui/loadsave.cpp +535 -0
  41. data/ext/opencv/highgui/precomp.hpp +223 -0
  42. data/ext/opencv/highgui/utils.cpp +689 -0
  43. data/ext/opencv/highgui/utils.hpp +128 -0
  44. data/ext/opencv/imgproc/___.c +3 -0
  45. data/ext/opencv/imgproc/_geom.h +72 -0
  46. data/ext/opencv/imgproc/color.cpp +3179 -0
  47. data/ext/opencv/imgproc/contours.cpp +1780 -0
  48. data/ext/opencv/imgproc/extconf.rb +11 -0
  49. data/ext/opencv/imgproc/filter.cpp +3063 -0
  50. data/ext/opencv/imgproc/precomp.hpp +159 -0
  51. data/ext/opencv/imgproc/shapedescr.cpp +1306 -0
  52. data/ext/opencv/imgproc/smooth.cpp +1566 -0
  53. data/ext/opencv/imgproc/tables.cpp +214 -0
  54. data/ext/opencv/imgproc/thresh.cpp +636 -0
  55. data/ext/opencv/imgproc/utils.cpp +242 -0
  56. data/ext/opencv/include/opencv2/core/core.hpp +4344 -0
  57. data/ext/opencv/include/opencv2/core/core_c.h +1885 -0
  58. data/ext/opencv/include/opencv2/core/internal.hpp +710 -0
  59. data/ext/opencv/include/opencv2/core/mat.hpp +2557 -0
  60. data/ext/opencv/include/opencv2/core/operations.hpp +3623 -0
  61. data/ext/opencv/include/opencv2/core/types_c.h +1875 -0
  62. data/ext/opencv/include/opencv2/core/version.hpp +58 -0
  63. data/ext/opencv/include/opencv2/highgui/highgui.hpp +198 -0
  64. data/ext/opencv/include/opencv2/highgui/highgui_c.h +506 -0
  65. data/ext/opencv/include/opencv2/imgproc/imgproc.hpp +1139 -0
  66. data/ext/opencv/include/opencv2/imgproc/imgproc_c.h +783 -0
  67. data/ext/opencv/include/opencv2/imgproc/types_c.h +538 -0
  68. data/imagecore.gemspec +20 -0
  69. data/lib/imagecore.rb +16 -0
  70. data/lib/imagecore/version.rb +3 -0
  71. metadata +119 -0
data/ext/opencv/imgproc/extconf.rb
@@ -0,0 +1,11 @@
+ require 'mkmf'
+
+ # not valid for C++ code
+ $warnflags = ($warnflags.to_s.split - %w(-Wdeclaration-after-statement -Wimplicit-function-declaration)) * ' '
+
+ # OpenCV includes
+ $INCFLAGS << ' -I ../include'
+
+ create_header('cvconfig.h')
+
+ create_makefile("opencv_imgproc")
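The $warnflags tweak above only strips two C-only warning flags (-Wdeclaration-after-statement and -Wimplicit-function-declaration) that are rejected when compiling the vendored C++ sources; everything else in the flag string is preserved. A minimal standalone sketch of that string manipulation, using a made-up flag string purely for illustration:

    # plain Ruby, no mkmf required; the input string here is hypothetical
    warnflags = "-Wall -Wdeclaration-after-statement -Wimplicit-function-declaration -Wextra"
    cxxflags  = (warnflags.to_s.split - %w(-Wdeclaration-after-statement -Wimplicit-function-declaration)) * ' '
    puts cxxflags   # => "-Wall -Wextra"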
data/ext/opencv/imgproc/filter.cpp
@@ -0,0 +1,3063 @@
+ /*M///////////////////////////////////////////////////////////////////////////////////////
+ //
+ // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+ //
+ // By downloading, copying, installing or using the software you agree to this license.
+ // If you do not agree to this license, do not download, install,
+ // copy or use the software.
+ //
+ //
+ // License Agreement
+ // For Open Source Computer Vision Library
+ //
+ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+ // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+ // Third party copyrights are property of their respective owners.
+ //
+ // Redistribution and use in source and binary forms, with or without modification,
+ // are permitted provided that the following conditions are met:
+ //
+ // * Redistribution's of source code must retain the above copyright notice,
+ // this list of conditions and the following disclaimer.
+ //
+ // * Redistribution's in binary form must reproduce the above copyright notice,
+ // this list of conditions and the following disclaimer in the documentation
+ // and/or other materials provided with the distribution.
+ //
+ // * The name of the copyright holders may not be used to endorse or promote products
+ // derived from this software without specific prior written permission.
+ //
+ // This software is provided by the copyright holders and contributors "as is" and
+ // any express or implied warranties, including, but not limited to, the implied
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
+ // indirect, incidental, special, exemplary, or consequential damages
+ // (including, but not limited to, procurement of substitute goods or services;
+ // loss of use, data, or profits; or business interruption) however caused
+ // and on any theory of liability, whether in contract, strict liability,
+ // or tort (including negligence or otherwise) arising in any way out of
+ // the use of this software, even if advised of the possibility of such damage.
+ //
+ //M*/
+
+ #include "precomp.hpp"
+
+ /****************************************************************************************\
+ Base Image Filter
+ \****************************************************************************************/
+
+ /*
+ Various border types, image boundaries are denoted with '|'
+
+ * BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
+ * BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
+ * BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
+ * BORDER_WRAP: cdefgh|abcdefgh|abcdefg
+ * BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii with some specified 'i'
+ */
+ int cv::borderInterpolate( int p, int len, int borderType )
+ {
+ if( (unsigned)p < (unsigned)len )
+ ;
+ else if( borderType == BORDER_REPLICATE )
+ p = p < 0 ? 0 : len - 1;
+ else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
+ {
+ int delta = borderType == BORDER_REFLECT_101;
+ if( len == 1 )
+ return 0;
+ do
+ {
+ if( p < 0 )
+ p = -p - 1 + delta;
+ else
+ p = len - 1 - (p - len) - delta;
+ }
+ while( (unsigned)p >= (unsigned)len );
+ }
+ else if( borderType == BORDER_WRAP )
+ {
+ if( p < 0 )
+ p -= ((p-len+1)/len)*len;
+ if( p >= len )
+ p %= len;
+ }
+ else if( borderType == BORDER_CONSTANT )
+ p = -1;
+ else
+ CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
+ return p;
+ }
91
+
92
+
93
+ namespace cv
94
+ {
95
+
96
+ BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
97
+ BaseRowFilter::~BaseRowFilter() {}
98
+
99
+ BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
100
+ BaseColumnFilter::~BaseColumnFilter() {}
101
+ void BaseColumnFilter::reset() {}
102
+
103
+ BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
104
+ BaseFilter::~BaseFilter() {}
105
+ void BaseFilter::reset() {}
106
+
107
+ FilterEngine::FilterEngine()
108
+ {
109
+ srcType = dstType = bufType = -1;
110
+ rowBorderType = columnBorderType = BORDER_REPLICATE;
111
+ bufStep = startY = startY0 = endY = rowCount = dstY = 0;
112
+ maxWidth = 0;
113
+
114
+ wholeSize = Size(-1,-1);
115
+ }
116
+
117
+
118
+ FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
119
+ const Ptr<BaseRowFilter>& _rowFilter,
120
+ const Ptr<BaseColumnFilter>& _columnFilter,
121
+ int _srcType, int _dstType, int _bufType,
122
+ int _rowBorderType, int _columnBorderType,
123
+ const Scalar& _borderValue )
124
+ {
125
+ init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
126
+ _rowBorderType, _columnBorderType, _borderValue);
127
+ }
128
+
129
+ FilterEngine::~FilterEngine()
130
+ {
131
+ }
132
+
133
+
134
+ void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
135
+ const Ptr<BaseRowFilter>& _rowFilter,
136
+ const Ptr<BaseColumnFilter>& _columnFilter,
137
+ int _srcType, int _dstType, int _bufType,
138
+ int _rowBorderType, int _columnBorderType,
139
+ const Scalar& _borderValue )
140
+ {
141
+ _srcType = CV_MAT_TYPE(_srcType);
142
+ _bufType = CV_MAT_TYPE(_bufType);
143
+ _dstType = CV_MAT_TYPE(_dstType);
144
+
145
+ srcType = _srcType;
146
+ int srcElemSize = (int)getElemSize(srcType);
147
+ dstType = _dstType;
148
+ bufType = _bufType;
149
+
150
+ filter2D = _filter2D;
151
+ rowFilter = _rowFilter;
152
+ columnFilter = _columnFilter;
153
+
154
+ if( _columnBorderType < 0 )
155
+ _columnBorderType = _rowBorderType;
156
+
157
+ rowBorderType = _rowBorderType;
158
+ columnBorderType = _columnBorderType;
159
+
160
+ CV_Assert( columnBorderType != BORDER_WRAP );
161
+
162
+ if( isSeparable() )
163
+ {
164
+ CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
165
+ ksize = Size(rowFilter->ksize, columnFilter->ksize);
166
+ anchor = Point(rowFilter->anchor, columnFilter->anchor);
167
+ }
168
+ else
169
+ {
170
+ CV_Assert( bufType == srcType );
171
+ ksize = filter2D->ksize;
172
+ anchor = filter2D->anchor;
173
+ }
174
+
175
+ CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
176
+ 0 <= anchor.y && anchor.y < ksize.height );
177
+
178
+ borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
179
+ int borderLength = std::max(ksize.width - 1, 1);
180
+ borderTab.resize(borderLength*borderElemSize);
181
+
182
+ maxWidth = bufStep = 0;
183
+ constBorderRow.clear();
184
+
185
+ if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
186
+ {
187
+ constBorderValue.resize(srcElemSize*borderLength);
188
+ scalarToRawData(_borderValue, &constBorderValue[0], srcType,
189
+ borderLength*CV_MAT_CN(srcType));
190
+ }
191
+
192
+ wholeSize = Size(-1,-1);
193
+ }
194
+
195
+ static const int VEC_ALIGN = CV_MALLOC_ALIGN;
196
+
197
+ int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
198
+ {
199
+ int i, j;
200
+
201
+ wholeSize = _wholeSize;
202
+ roi = _roi;
203
+ CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
204
+ roi.x + roi.width <= wholeSize.width &&
205
+ roi.y + roi.height <= wholeSize.height );
206
+
207
+ int esz = (int)getElemSize(srcType);
208
+ int bufElemSize = (int)getElemSize(bufType);
209
+ const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;
210
+
211
+ if( _maxBufRows < 0 )
212
+ _maxBufRows = ksize.height + 3;
213
+ _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);
214
+
215
+ if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
216
+ {
217
+ rows.resize(_maxBufRows);
218
+ maxWidth = std::max(maxWidth, roi.width);
219
+ int cn = CV_MAT_CN(srcType);
220
+ srcRow.resize(esz*(maxWidth + ksize.width - 1));
221
+ if( columnBorderType == BORDER_CONSTANT )
222
+ {
223
+ constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
224
+ uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
225
+ int n = (int)constBorderValue.size(), N;
226
+ N = (maxWidth + ksize.width - 1)*esz;
227
+ tdst = isSeparable() ? &srcRow[0] : dst;
228
+
229
+ for( i = 0; i < N; i += n )
230
+ {
231
+ n = std::min( n, N - i );
232
+ for(j = 0; j < n; j++)
233
+ tdst[i+j] = constVal[j];
234
+ }
235
+
236
+ if( isSeparable() )
237
+ (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
238
+ }
239
+
240
+ int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
241
+ (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
242
+ ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
243
+ }
244
+
245
+ // adjust bufstep so that the used part of the ring buffer stays compact in memory
246
+ bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);
247
+
248
+ dx1 = std::max(anchor.x - roi.x, 0);
249
+ dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);
250
+
251
+ // recompute border tables
252
+ if( dx1 > 0 || dx2 > 0 )
253
+ {
254
+ if( rowBorderType == BORDER_CONSTANT )
255
+ {
256
+ int nr = isSeparable() ? 1 : (int)rows.size();
257
+ for( i = 0; i < nr; i++ )
258
+ {
259
+ uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
260
+ memcpy( dst, constVal, dx1*esz );
261
+ memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
262
+ }
263
+ }
264
+ else
265
+ {
266
+ int xofs1 = std::min(roi.x, anchor.x) - roi.x;
267
+
268
+ int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
269
+ int* btab = (int*)&borderTab[0];
270
+
271
+ for( i = 0; i < dx1; i++ )
272
+ {
273
+ int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
274
+ for( j = 0; j < btab_esz; j++ )
275
+ btab[i*btab_esz + j] = p0 + j;
276
+ }
277
+
278
+ for( i = 0; i < dx2; i++ )
279
+ {
280
+ int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
281
+ for( j = 0; j < btab_esz; j++ )
282
+ btab[(i + dx1)*btab_esz + j] = p0 + j;
283
+ }
284
+ }
285
+ }
286
+
287
+ rowCount = dstY = 0;
288
+ startY = startY0 = std::max(roi.y - anchor.y, 0);
289
+ endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
290
+ if( !columnFilter.empty() )
291
+ columnFilter->reset();
292
+ if( !filter2D.empty() )
293
+ filter2D->reset();
294
+
295
+ return startY;
296
+ }
297
+
298
+
299
+ int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
300
+ bool isolated, int maxBufRows)
301
+ {
302
+ Rect srcRoi = _srcRoi;
303
+
304
+ if( srcRoi == Rect(0,0,-1,-1) )
305
+ srcRoi = Rect(0,0,src.cols,src.rows);
306
+
307
+ CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
308
+ srcRoi.width >= 0 && srcRoi.height >= 0 &&
309
+ srcRoi.x + srcRoi.width <= src.cols &&
310
+ srcRoi.y + srcRoi.height <= src.rows );
311
+
312
+ Point ofs;
313
+ Size wholeSize(src.cols, src.rows);
314
+ if( !isolated )
315
+ src.locateROI( wholeSize, ofs );
316
+ start( wholeSize, srcRoi + ofs, maxBufRows );
317
+
318
+ return startY - ofs.y;
319
+ }
320
+
321
+
322
+ int FilterEngine::remainingInputRows() const
323
+ {
324
+ return endY - startY - rowCount;
325
+ }
326
+
327
+ int FilterEngine::remainingOutputRows() const
328
+ {
329
+ return roi.height - dstY;
330
+ }
331
+
332
+ int FilterEngine::proceed( const uchar* src, int srcstep, int count,
333
+ uchar* dst, int dststep )
334
+ {
335
+ CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );
336
+
337
+ const int *btab = &borderTab[0];
338
+ int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
339
+ uchar** brows = &rows[0];
340
+ int bufRows = (int)rows.size();
341
+ int cn = CV_MAT_CN(bufType);
342
+ int width = roi.width, kwidth = ksize.width;
343
+ int kheight = ksize.height, ay = anchor.y;
344
+ int _dx1 = dx1, _dx2 = dx2;
345
+ int width1 = roi.width + kwidth - 1;
346
+ int xofs1 = std::min(roi.x, anchor.x);
347
+ bool isSep = isSeparable();
348
+ bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
349
+ int dy = 0, i = 0;
350
+
351
+ src -= xofs1*esz;
352
+ count = std::min(count, remainingInputRows());
353
+
354
+ CV_Assert( src && dst && count > 0 );
355
+
356
+ for(;; dst += dststep*i, dy += i)
357
+ {
358
+ int dcount = bufRows - ay - startY - rowCount + roi.y;
359
+ dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
360
+ dcount = std::min(dcount, count);
361
+ count -= dcount;
362
+ for( ; dcount-- > 0; src += srcstep )
363
+ {
364
+ int bi = (startY - startY0 + rowCount) % bufRows;
365
+ uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
366
+ uchar* row = isSep ? &srcRow[0] : brow;
367
+
368
+ if( ++rowCount > bufRows )
369
+ {
370
+ --rowCount;
371
+ ++startY;
372
+ }
373
+
374
+ memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );
375
+
376
+ if( makeBorder )
377
+ {
378
+ if( btab_esz*(int)sizeof(int) == esz )
379
+ {
380
+ const int* isrc = (const int*)src;
381
+ int* irow = (int*)row;
382
+
383
+ for( i = 0; i < _dx1*btab_esz; i++ )
384
+ irow[i] = isrc[btab[i]];
385
+ for( i = 0; i < _dx2*btab_esz; i++ )
386
+ irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
387
+ }
388
+ else
389
+ {
390
+ for( i = 0; i < _dx1*esz; i++ )
391
+ row[i] = src[btab[i]];
392
+ for( i = 0; i < _dx2*esz; i++ )
393
+ row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
394
+ }
395
+ }
396
+
397
+ if( isSep )
398
+ (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
399
+ }
400
+
401
+ int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
402
+ for( i = 0; i < max_i; i++ )
403
+ {
404
+ int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
405
+ wholeSize.height, columnBorderType);
406
+ if( srcY < 0 ) // can happen only with constant border type
407
+ brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
408
+ else
409
+ {
410
+ CV_Assert( srcY >= startY );
411
+ if( srcY >= startY + rowCount )
412
+ break;
413
+ int bi = (srcY - startY0) % bufRows;
414
+ brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
415
+ }
416
+ }
417
+ if( i < kheight )
418
+ break;
419
+ i -= kheight - 1;
420
+ if( isSeparable() )
421
+ (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
422
+ else
423
+ (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
424
+ }
425
+
426
+ dstY += dy;
427
+ CV_Assert( dstY <= roi.height );
428
+ return dy;
429
+ }
430
+
431
+
432
+ void FilterEngine::apply(const Mat& src, Mat& dst,
433
+ const Rect& _srcRoi, Point dstOfs, bool isolated)
434
+ {
435
+ CV_Assert( src.type() == srcType && dst.type() == dstType );
436
+
437
+ Rect srcRoi = _srcRoi;
438
+ if( srcRoi == Rect(0,0,-1,-1) )
439
+ srcRoi = Rect(0,0,src.cols,src.rows);
440
+
441
+ if( srcRoi.area() == 0 )
442
+ return;
443
+
444
+ CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
445
+ dstOfs.x + srcRoi.width <= dst.cols &&
446
+ dstOfs.y + srcRoi.height <= dst.rows );
447
+
448
+ int y = start(src, srcRoi, isolated);
449
+ proceed( src.data + y*src.step, (int)src.step, endY - startY,
450
+ dst.data + dstOfs.y*dst.step + dstOfs.x*dst.elemSize(), (int)dst.step );
451
+ }
452
+
453
+ }
454
+
455
+ /****************************************************************************************\
456
+ * Separable linear filter *
457
+ \****************************************************************************************/
458
+
459
+ int cv::getKernelType(InputArray filter_kernel, Point anchor)
460
+ {
461
+ Mat _kernel = filter_kernel.getMat();
462
+ CV_Assert( _kernel.channels() == 1 );
463
+ int i, sz = _kernel.rows*_kernel.cols;
464
+
465
+ Mat kernel;
466
+ _kernel.convertTo(kernel, CV_64F);
467
+
468
+ const double* coeffs = (double*)kernel.data;
469
+ double sum = 0;
470
+ int type = KERNEL_SMOOTH + KERNEL_INTEGER;
471
+ if( (_kernel.rows == 1 || _kernel.cols == 1) &&
472
+ anchor.x*2 + 1 == _kernel.cols &&
473
+ anchor.y*2 + 1 == _kernel.rows )
474
+ type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);
475
+
476
+ for( i = 0; i < sz; i++ )
477
+ {
478
+ double a = coeffs[i], b = coeffs[sz - i - 1];
479
+ if( a != b )
480
+ type &= ~KERNEL_SYMMETRICAL;
481
+ if( a != -b )
482
+ type &= ~KERNEL_ASYMMETRICAL;
483
+ if( a < 0 )
484
+ type &= ~KERNEL_SMOOTH;
485
+ if( a != saturate_cast<int>(a) )
486
+ type &= ~KERNEL_INTEGER;
487
+ sum += a;
488
+ }
489
+
490
+ if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
491
+ type &= ~KERNEL_SMOOTH;
492
+ return type;
493
+ }
494
+
495
+
496
+ namespace cv
497
+ {
498
+
499
+ struct RowNoVec
500
+ {
501
+ RowNoVec() {}
502
+ RowNoVec(const Mat&) {}
503
+ int operator()(const uchar*, uchar*, int, int) const { return 0; }
504
+ };
505
+
506
+ struct ColumnNoVec
507
+ {
508
+ ColumnNoVec() {}
509
+ ColumnNoVec(const Mat&, int, int, double) {}
510
+ int operator()(const uchar**, uchar*, int) const { return 0; }
511
+ };
512
+
513
+ struct SymmRowSmallNoVec
514
+ {
515
+ SymmRowSmallNoVec() {}
516
+ SymmRowSmallNoVec(const Mat&, int) {}
517
+ int operator()(const uchar*, uchar*, int, int) const { return 0; }
518
+ };
519
+
520
+ struct SymmColumnSmallNoVec
521
+ {
522
+ SymmColumnSmallNoVec() {}
523
+ SymmColumnSmallNoVec(const Mat&, int, int, double) {}
524
+ int operator()(const uchar**, uchar*, int) const { return 0; }
525
+ };
526
+
527
+ struct FilterNoVec
528
+ {
529
+ FilterNoVec() {}
530
+ FilterNoVec(const Mat&, int, double) {}
531
+ int operator()(const uchar**, uchar*, int) const { return 0; }
532
+ };
533
+
534
+
535
+ #if CV_SSE2
536
+
537
+ ///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
538
+
539
+ struct RowVec_8u32s
540
+ {
541
+ RowVec_8u32s() { smallValues = false; }
542
+ RowVec_8u32s( const Mat& _kernel )
543
+ {
544
+ kernel = _kernel;
545
+ smallValues = true;
546
+ int k, ksize = kernel.rows + kernel.cols - 1;
547
+ for( k = 0; k < ksize; k++ )
548
+ {
549
+ int v = ((const int*)kernel.data)[k];
550
+ if( v < SHRT_MIN || v > SHRT_MAX )
551
+ {
552
+ smallValues = false;
553
+ break;
554
+ }
555
+ }
556
+ }
557
+
558
+ int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
559
+ {
560
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
561
+ return 0;
562
+
563
+ int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
564
+ int* dst = (int*)_dst;
565
+ const int* _kx = (const int*)kernel.data;
566
+ width *= cn;
567
+
568
+ if( smallValues )
569
+ {
570
+ for( ; i <= width - 16; i += 16 )
571
+ {
572
+ const uchar* src = _src + i;
573
+ __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
574
+ __m128i x0, x1, x2, x3;
575
+
576
+ for( k = 0; k < _ksize; k++, src += cn )
577
+ {
578
+ f = _mm_cvtsi32_si128(_kx[k]);
579
+ f = _mm_shuffle_epi32(f, 0);
580
+ f = _mm_packs_epi32(f, f);
581
+
582
+ x0 = _mm_loadu_si128((const __m128i*)src);
583
+ x2 = _mm_unpackhi_epi8(x0, z);
584
+ x0 = _mm_unpacklo_epi8(x0, z);
585
+ x1 = _mm_mulhi_epi16(x0, f);
586
+ x3 = _mm_mulhi_epi16(x2, f);
587
+ x0 = _mm_mullo_epi16(x0, f);
588
+ x2 = _mm_mullo_epi16(x2, f);
589
+
590
+ s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
591
+ s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
592
+ s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
593
+ s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
594
+ }
595
+
596
+ _mm_store_si128((__m128i*)(dst + i), s0);
597
+ _mm_store_si128((__m128i*)(dst + i + 4), s1);
598
+ _mm_store_si128((__m128i*)(dst + i + 8), s2);
599
+ _mm_store_si128((__m128i*)(dst + i + 12), s3);
600
+ }
601
+
602
+ for( ; i <= width - 4; i += 4 )
603
+ {
604
+ const uchar* src = _src + i;
605
+ __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;
606
+
607
+ for( k = 0; k < _ksize; k++, src += cn )
608
+ {
609
+ f = _mm_cvtsi32_si128(_kx[k]);
610
+ f = _mm_shuffle_epi32(f, 0);
611
+ f = _mm_packs_epi32(f, f);
612
+
613
+ x0 = _mm_cvtsi32_si128(*(const int*)src);
614
+ x0 = _mm_unpacklo_epi8(x0, z);
615
+ x1 = _mm_mulhi_epi16(x0, f);
616
+ x0 = _mm_mullo_epi16(x0, f);
617
+ s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
618
+ }
619
+ _mm_store_si128((__m128i*)(dst + i), s0);
620
+ }
621
+ }
622
+ return i;
623
+ }
624
+
625
+ Mat kernel;
626
+ bool smallValues;
627
+ };
628
+
629
+
630
+ struct SymmRowSmallVec_8u32s
631
+ {
632
+ SymmRowSmallVec_8u32s() { smallValues = false; }
633
+ SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
634
+ {
635
+ kernel = _kernel;
636
+ symmetryType = _symmetryType;
637
+ smallValues = true;
638
+ int k, ksize = kernel.rows + kernel.cols - 1;
639
+ for( k = 0; k < ksize; k++ )
640
+ {
641
+ int v = ((const int*)kernel.data)[k];
642
+ if( v < SHRT_MIN || v > SHRT_MAX )
643
+ {
644
+ smallValues = false;
645
+ break;
646
+ }
647
+ }
648
+ }
649
+
650
+ int operator()(const uchar* src, uchar* _dst, int width, int cn) const
651
+ {
652
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
653
+ return 0;
654
+
655
+ int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
656
+ int* dst = (int*)_dst;
657
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
658
+ const int* kx = (const int*)kernel.data + _ksize/2;
659
+ if( !smallValues )
660
+ return 0;
661
+
662
+ src += (_ksize/2)*cn;
663
+ width *= cn;
664
+
665
+ __m128i z = _mm_setzero_si128();
666
+ if( symmetrical )
667
+ {
668
+ if( _ksize == 1 )
669
+ return 0;
670
+ if( _ksize == 3 )
671
+ {
672
+ if( kx[0] == 2 && kx[1] == 1 )
673
+ for( ; i <= width - 16; i += 16, src += 16 )
674
+ {
675
+ __m128i x0, x1, x2, y0, y1, y2;
676
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
677
+ x1 = _mm_loadu_si128((__m128i*)src);
678
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
679
+ y0 = _mm_unpackhi_epi8(x0, z);
680
+ x0 = _mm_unpacklo_epi8(x0, z);
681
+ y1 = _mm_unpackhi_epi8(x1, z);
682
+ x1 = _mm_unpacklo_epi8(x1, z);
683
+ y2 = _mm_unpackhi_epi8(x2, z);
684
+ x2 = _mm_unpacklo_epi8(x2, z);
685
+ x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
686
+ y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
687
+ _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
688
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
689
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
690
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
691
+ }
692
+ else if( kx[0] == -2 && kx[1] == 1 )
693
+ for( ; i <= width - 16; i += 16, src += 16 )
694
+ {
695
+ __m128i x0, x1, x2, y0, y1, y2;
696
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
697
+ x1 = _mm_loadu_si128((__m128i*)src);
698
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
699
+ y0 = _mm_unpackhi_epi8(x0, z);
700
+ x0 = _mm_unpacklo_epi8(x0, z);
701
+ y1 = _mm_unpackhi_epi8(x1, z);
702
+ x1 = _mm_unpacklo_epi8(x1, z);
703
+ y2 = _mm_unpackhi_epi8(x2, z);
704
+ x2 = _mm_unpacklo_epi8(x2, z);
705
+ x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
706
+ y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
707
+ _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
708
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
709
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
710
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
711
+ }
712
+ else
713
+ {
714
+ __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
715
+ k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
716
+ k0 = _mm_packs_epi32(k0, k0);
717
+ k1 = _mm_packs_epi32(k1, k1);
718
+
719
+ for( ; i <= width - 16; i += 16, src += 16 )
720
+ {
721
+ __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
722
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
723
+ x1 = _mm_loadu_si128((__m128i*)src);
724
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
725
+ y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
726
+ x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
727
+ y1 = _mm_unpackhi_epi8(x1, z);
728
+ x1 = _mm_unpacklo_epi8(x1, z);
729
+
730
+ t1 = _mm_mulhi_epi16(x1, k0);
731
+ t0 = _mm_mullo_epi16(x1, k0);
732
+ x2 = _mm_mulhi_epi16(x0, k1);
733
+ x0 = _mm_mullo_epi16(x0, k1);
734
+ z0 = _mm_unpacklo_epi16(t0, t1);
735
+ z1 = _mm_unpackhi_epi16(t0, t1);
736
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
737
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));
738
+
739
+ t1 = _mm_mulhi_epi16(y1, k0);
740
+ t0 = _mm_mullo_epi16(y1, k0);
741
+ y1 = _mm_mulhi_epi16(y0, k1);
742
+ y0 = _mm_mullo_epi16(y0, k1);
743
+ z2 = _mm_unpacklo_epi16(t0, t1);
744
+ z3 = _mm_unpackhi_epi16(t0, t1);
745
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
746
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
747
+ _mm_store_si128((__m128i*)(dst + i), z0);
748
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
749
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
750
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
751
+ }
752
+ }
753
+ }
754
+ else if( _ksize == 5 )
755
+ {
756
+ if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
757
+ for( ; i <= width - 16; i += 16, src += 16 )
758
+ {
759
+ __m128i x0, x1, x2, y0, y1, y2;
760
+ x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
761
+ x1 = _mm_loadu_si128((__m128i*)src);
762
+ x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
763
+ y0 = _mm_unpackhi_epi8(x0, z);
764
+ x0 = _mm_unpacklo_epi8(x0, z);
765
+ y1 = _mm_unpackhi_epi8(x1, z);
766
+ x1 = _mm_unpacklo_epi8(x1, z);
767
+ y2 = _mm_unpackhi_epi8(x2, z);
768
+ x2 = _mm_unpacklo_epi8(x2, z);
769
+ x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
770
+ y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
771
+ _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
772
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
773
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
774
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
775
+ }
776
+ else
777
+ {
778
+ __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
779
+ k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
780
+ k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
781
+ k0 = _mm_packs_epi32(k0, k0);
782
+ k1 = _mm_packs_epi32(k1, k1);
783
+ k2 = _mm_packs_epi32(k2, k2);
784
+
785
+ for( ; i <= width - 16; i += 16, src += 16 )
786
+ {
787
+ __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
788
+ x0 = _mm_loadu_si128((__m128i*)(src - cn));
789
+ x1 = _mm_loadu_si128((__m128i*)src);
790
+ x2 = _mm_loadu_si128((__m128i*)(src + cn));
791
+ y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
792
+ x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
793
+ y1 = _mm_unpackhi_epi8(x1, z);
794
+ x1 = _mm_unpacklo_epi8(x1, z);
795
+
796
+ t1 = _mm_mulhi_epi16(x1, k0);
797
+ t0 = _mm_mullo_epi16(x1, k0);
798
+ x2 = _mm_mulhi_epi16(x0, k1);
799
+ x0 = _mm_mullo_epi16(x0, k1);
800
+ z0 = _mm_unpacklo_epi16(t0, t1);
801
+ z1 = _mm_unpackhi_epi16(t0, t1);
802
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
803
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));
804
+
805
+ t1 = _mm_mulhi_epi16(y1, k0);
806
+ t0 = _mm_mullo_epi16(y1, k0);
807
+ y1 = _mm_mulhi_epi16(y0, k1);
808
+ y0 = _mm_mullo_epi16(y0, k1);
809
+ z2 = _mm_unpacklo_epi16(t0, t1);
810
+ z3 = _mm_unpackhi_epi16(t0, t1);
811
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
812
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
813
+
814
+ x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
815
+ x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
816
+ y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
817
+ y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
818
+
819
+ t1 = _mm_mulhi_epi16(y0, k2);
820
+ t0 = _mm_mullo_epi16(y0, k2);
821
+ y0 = _mm_mullo_epi16(y1, k2);
822
+ y1 = _mm_mulhi_epi16(y1, k2);
823
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
824
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
825
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
826
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
827
+
828
+ _mm_store_si128((__m128i*)(dst + i), z0);
829
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
830
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
831
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
832
+ }
833
+ }
834
+ }
835
+ }
836
+ else
837
+ {
838
+ if( _ksize == 3 )
839
+ {
840
+ if( kx[0] == 0 && kx[1] == 1 )
841
+ for( ; i <= width - 16; i += 16, src += 16 )
842
+ {
843
+ __m128i x0, x1, y0;
844
+ x0 = _mm_loadu_si128((__m128i*)(src + cn));
845
+ x1 = _mm_loadu_si128((__m128i*)(src - cn));
846
+ y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
847
+ x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
848
+ _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
849
+ _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
850
+ _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
851
+ _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
852
+ }
853
+ else
854
+ {
855
+ __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
856
+ k1 = _mm_packs_epi32(k1, k1);
857
+
858
+ for( ; i <= width - 16; i += 16, src += 16 )
859
+ {
860
+ __m128i x0, x1, y0, y1, z0, z1, z2, z3;
861
+ x0 = _mm_loadu_si128((__m128i*)(src + cn));
862
+ x1 = _mm_loadu_si128((__m128i*)(src - cn));
863
+ y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
864
+ x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
865
+
866
+ x1 = _mm_mulhi_epi16(x0, k1);
867
+ x0 = _mm_mullo_epi16(x0, k1);
868
+ z0 = _mm_unpacklo_epi16(x0, x1);
869
+ z1 = _mm_unpackhi_epi16(x0, x1);
870
+
871
+ y1 = _mm_mulhi_epi16(y0, k1);
872
+ y0 = _mm_mullo_epi16(y0, k1);
873
+ z2 = _mm_unpacklo_epi16(y0, y1);
874
+ z3 = _mm_unpackhi_epi16(y0, y1);
875
+ _mm_store_si128((__m128i*)(dst + i), z0);
876
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
877
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
878
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
879
+ }
880
+ }
881
+ }
882
+ else if( _ksize == 5 )
883
+ {
884
+ __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
885
+ k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
886
+ k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
887
+ k0 = _mm_packs_epi32(k0, k0);
888
+ k1 = _mm_packs_epi32(k1, k1);
889
+ k2 = _mm_packs_epi32(k2, k2);
890
+
891
+ for( ; i <= width - 16; i += 16, src += 16 )
892
+ {
893
+ __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
894
+ x0 = _mm_loadu_si128((__m128i*)(src + cn));
895
+ x2 = _mm_loadu_si128((__m128i*)(src - cn));
896
+ y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
897
+ x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
898
+
899
+ x2 = _mm_mulhi_epi16(x0, k1);
900
+ x0 = _mm_mullo_epi16(x0, k1);
901
+ z0 = _mm_unpacklo_epi16(x0, x2);
902
+ z1 = _mm_unpackhi_epi16(x0, x2);
903
+ y1 = _mm_mulhi_epi16(y0, k1);
904
+ y0 = _mm_mullo_epi16(y0, k1);
905
+ z2 = _mm_unpacklo_epi16(y0, y1);
906
+ z3 = _mm_unpackhi_epi16(y0, y1);
907
+
908
+ x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
909
+ x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
910
+ y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
911
+ y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
912
+
913
+ t1 = _mm_mulhi_epi16(y0, k2);
914
+ t0 = _mm_mullo_epi16(y0, k2);
915
+ y0 = _mm_mullo_epi16(y1, k2);
916
+ y1 = _mm_mulhi_epi16(y1, k2);
917
+ z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
918
+ z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
919
+ z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
920
+ z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
921
+
922
+ _mm_store_si128((__m128i*)(dst + i), z0);
923
+ _mm_store_si128((__m128i*)(dst + i + 4), z1);
924
+ _mm_store_si128((__m128i*)(dst + i + 8), z2);
925
+ _mm_store_si128((__m128i*)(dst + i + 12), z3);
926
+ }
927
+ }
928
+ }
929
+
930
+ src -= (_ksize/2)*cn;
931
+ kx -= _ksize/2;
932
+ for( ; i <= width - 4; i += 4, src += 4 )
933
+ {
934
+ __m128i f, s0 = z, x0, x1;
935
+
936
+ for( k = j = 0; k < _ksize; k++, j += cn )
937
+ {
938
+ f = _mm_cvtsi32_si128(kx[k]);
939
+ f = _mm_shuffle_epi32(f, 0);
940
+ f = _mm_packs_epi32(f, f);
941
+
942
+ x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
943
+ x0 = _mm_unpacklo_epi8(x0, z);
944
+ x1 = _mm_mulhi_epi16(x0, f);
945
+ x0 = _mm_mullo_epi16(x0, f);
946
+ s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
947
+ }
948
+ _mm_store_si128((__m128i*)(dst + i), s0);
949
+ }
950
+
951
+ return i;
952
+ }
953
+
954
+ Mat kernel;
955
+ int symmetryType;
956
+ bool smallValues;
957
+ };
958
+
959
+
960
+ struct SymmColumnVec_32s8u
961
+ {
962
+ SymmColumnVec_32s8u() { symmetryType=0; }
963
+ SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
964
+ {
965
+ symmetryType = _symmetryType;
966
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
967
+ delta = (float)(_delta/(1 << _bits));
968
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
969
+ }
970
+
971
+ int operator()(const uchar** _src, uchar* dst, int width) const
972
+ {
973
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
974
+ return 0;
975
+
976
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
977
+ const float* ky = (const float*)kernel.data + ksize2;
978
+ int i = 0, k;
979
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
980
+ const int** src = (const int**)_src;
981
+ const __m128i *S, *S2;
982
+ __m128 d4 = _mm_set1_ps(delta);
983
+
984
+ if( symmetrical )
985
+ {
986
+ for( ; i <= width - 16; i += 16 )
987
+ {
988
+ __m128 f = _mm_load_ss(ky);
989
+ f = _mm_shuffle_ps(f, f, 0);
990
+ __m128 s0, s1, s2, s3;
991
+ __m128i x0, x1;
992
+ S = (const __m128i*)(src[0] + i);
993
+ s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
994
+ s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
995
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
996
+ s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
997
+ s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
998
+ s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
999
+ s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
1000
+ s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
1001
+
1002
+ for( k = 1; k <= ksize2; k++ )
1003
+ {
1004
+ S = (const __m128i*)(src[k] + i);
1005
+ S2 = (const __m128i*)(src[-k] + i);
1006
+ f = _mm_load_ss(ky+k);
1007
+ f = _mm_shuffle_ps(f, f, 0);
1008
+ x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1009
+ x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
1010
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1011
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1012
+ x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
1013
+ x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
1014
+ s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1015
+ s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1016
+ }
1017
+
1018
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1019
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1020
+ x0 = _mm_packus_epi16(x0, x1);
1021
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1022
+ }
1023
+
1024
+ for( ; i <= width - 4; i += 4 )
1025
+ {
1026
+ __m128 f = _mm_load_ss(ky);
1027
+ f = _mm_shuffle_ps(f, f, 0);
1028
+ __m128i x0;
1029
+ __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
1030
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1031
+
1032
+ for( k = 1; k <= ksize2; k++ )
1033
+ {
1034
+ S = (const __m128i*)(src[k] + i);
1035
+ S2 = (const __m128i*)(src[-k] + i);
1036
+ f = _mm_load_ss(ky+k);
1037
+ f = _mm_shuffle_ps(f, f, 0);
1038
+ x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1039
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1040
+ }
1041
+
1042
+ x0 = _mm_cvtps_epi32(s0);
1043
+ x0 = _mm_packs_epi32(x0, x0);
1044
+ x0 = _mm_packus_epi16(x0, x0);
1045
+ *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
1046
+ }
1047
+ }
1048
+ else
1049
+ {
1050
+ for( ; i <= width - 16; i += 16 )
1051
+ {
1052
+ __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1053
+ __m128i x0, x1;
1054
+
1055
+ for( k = 1; k <= ksize2; k++ )
1056
+ {
1057
+ S = (const __m128i*)(src[k] + i);
1058
+ S2 = (const __m128i*)(src[-k] + i);
1059
+ f = _mm_load_ss(ky+k);
1060
+ f = _mm_shuffle_ps(f, f, 0);
1061
+ x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1062
+ x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
1063
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1064
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1065
+ x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
1066
+ x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
1067
+ s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1068
+ s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
1069
+ }
1070
+
1071
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1072
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1073
+ x0 = _mm_packus_epi16(x0, x1);
1074
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1075
+ }
1076
+
1077
+ for( ; i <= width - 4; i += 4 )
1078
+ {
1079
+ __m128 f, s0 = d4;
1080
+ __m128i x0;
1081
+
1082
+ for( k = 1; k <= ksize2; k++ )
1083
+ {
1084
+ S = (const __m128i*)(src[k] + i);
1085
+ S2 = (const __m128i*)(src[-k] + i);
1086
+ f = _mm_load_ss(ky+k);
1087
+ f = _mm_shuffle_ps(f, f, 0);
1088
+ x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
1089
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
1090
+ }
1091
+
1092
+ x0 = _mm_cvtps_epi32(s0);
1093
+ x0 = _mm_packs_epi32(x0, x0);
1094
+ x0 = _mm_packus_epi16(x0, x0);
1095
+ *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
1096
+ }
1097
+ }
1098
+
1099
+ return i;
1100
+ }
1101
+
1102
+ int symmetryType;
1103
+ float delta;
1104
+ Mat kernel;
1105
+ };
1106
+
1107
+
1108
+ struct SymmColumnSmallVec_32s16s
1109
+ {
1110
+ SymmColumnSmallVec_32s16s() { symmetryType=0; }
1111
+ SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
1112
+ {
1113
+ symmetryType = _symmetryType;
1114
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1115
+ delta = (float)(_delta/(1 << _bits));
1116
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1117
+ }
1118
+
1119
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1120
+ {
1121
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
1122
+ return 0;
1123
+
1124
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1125
+ const float* ky = (const float*)kernel.data + ksize2;
1126
+ int i = 0;
1127
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1128
+ const int** src = (const int**)_src;
1129
+ const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
1130
+ short* dst = (short*)_dst;
1131
+ __m128 df4 = _mm_set1_ps(delta);
1132
+ __m128i d4 = _mm_cvtps_epi32(df4);
1133
+
1134
+ if( symmetrical )
1135
+ {
1136
+ if( ky[0] == 2 && ky[1] == 1 )
1137
+ {
1138
+ for( ; i <= width - 8; i += 8 )
1139
+ {
1140
+ __m128i s0, s1, s2, s3, s4, s5;
1141
+ s0 = _mm_load_si128((__m128i*)(S0 + i));
1142
+ s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
1143
+ s2 = _mm_load_si128((__m128i*)(S1 + i));
1144
+ s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
1145
+ s4 = _mm_load_si128((__m128i*)(S2 + i));
1146
+ s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
1147
+ s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
1148
+ s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
1149
+ s0 = _mm_add_epi32(s0, d4);
1150
+ s1 = _mm_add_epi32(s1, d4);
1151
+ _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
1152
+ }
1153
+ }
1154
+ else if( ky[0] == -2 && ky[1] == 1 )
1155
+ {
1156
+ for( ; i <= width - 8; i += 8 )
1157
+ {
1158
+ __m128i s0, s1, s2, s3, s4, s5;
1159
+ s0 = _mm_load_si128((__m128i*)(S0 + i));
1160
+ s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
1161
+ s2 = _mm_load_si128((__m128i*)(S1 + i));
1162
+ s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
1163
+ s4 = _mm_load_si128((__m128i*)(S2 + i));
1164
+ s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
1165
+ s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
1166
+ s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
1167
+ s0 = _mm_add_epi32(s0, d4);
1168
+ s1 = _mm_add_epi32(s1, d4);
1169
+ _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
1170
+ }
1171
+ }
1172
+ else
1173
+ {
1174
+ __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
1175
+ for( ; i <= width - 8; i += 8 )
1176
+ {
1177
+ __m128 s0, s1;
1178
+ s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
1179
+ s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
1180
+ s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
1181
+ s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
1182
+ __m128i x0, x1;
1183
+ x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
1184
+ _mm_load_si128((__m128i*)(S2 + i)));
1185
+ x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
1186
+ _mm_load_si128((__m128i*)(S2 + i + 4)));
1187
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
1188
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
1189
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1190
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1191
+ }
1192
+ }
1193
+ }
1194
+ else
1195
+ {
1196
+ if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
1197
+ {
1198
+ if( ky[1] < 0 )
1199
+ std::swap(S0, S2);
1200
+ for( ; i <= width - 8; i += 8 )
1201
+ {
1202
+ __m128i s0, s1, s2, s3;
1203
+ s0 = _mm_load_si128((__m128i*)(S2 + i));
1204
+ s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
1205
+ s2 = _mm_load_si128((__m128i*)(S0 + i));
1206
+ s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
1207
+ s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
1208
+ s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
1209
+ _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
1210
+ }
1211
+ }
1212
+ else
1213
+ {
1214
+ __m128 k1 = _mm_set1_ps(ky[1]);
1215
+ for( ; i <= width - 8; i += 8 )
1216
+ {
1217
+ __m128 s0 = df4, s1 = df4;
1218
+ __m128i x0, x1;
1219
+ x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
1220
+ _mm_load_si128((__m128i*)(S2 + i)));
1221
+ x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
1222
+ _mm_load_si128((__m128i*)(S2 + i + 4)));
1223
+ s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
1224
+ s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
1225
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1226
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1227
+ }
1228
+ }
1229
+ }
1230
+
1231
+ return i;
1232
+ }
1233
+
1234
+ int symmetryType;
1235
+ float delta;
1236
+ Mat kernel;
1237
+ };
1238
+
1239
+
1240
+ /////////////////////////////////////// 32f //////////////////////////////////
1241
+
1242
+ struct RowVec_32f
1243
+ {
1244
+ RowVec_32f() {}
1245
+ RowVec_32f( const Mat& _kernel )
1246
+ {
1247
+ kernel = _kernel;
1248
+ }
1249
+
1250
+ int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1251
+ {
1252
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1253
+ return 0;
1254
+
1255
+ int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
1256
+ float* dst = (float*)_dst;
1257
+ const float* _kx = (const float*)kernel.data;
1258
+ width *= cn;
1259
+
1260
+ for( ; i <= width - 8; i += 8 )
1261
+ {
1262
+ const float* src = (const float*)_src + i;
1263
+ __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
1264
+ for( k = 0; k < _ksize; k++, src += cn )
1265
+ {
1266
+ f = _mm_load_ss(_kx+k);
1267
+ f = _mm_shuffle_ps(f, f, 0);
1268
+
1269
+ x0 = _mm_loadu_ps(src);
1270
+ x1 = _mm_loadu_ps(src + 4);
1271
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1272
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1273
+ }
1274
+ _mm_store_ps(dst + i, s0);
1275
+ _mm_store_ps(dst + i + 4, s1);
1276
+ }
1277
+ return i;
1278
+ }
1279
+
1280
+ Mat kernel;
1281
+ };
1282
+
1283
+
1284
+ struct SymmRowSmallVec_32f
1285
+ {
1286
+ SymmRowSmallVec_32f() {}
1287
+ SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
1288
+ {
1289
+ kernel = _kernel;
1290
+ symmetryType = _symmetryType;
1291
+ }
1292
+
1293
+ int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1294
+ {
1295
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1296
+ return 0;
1297
+
1298
+ int i = 0, _ksize = kernel.rows + kernel.cols - 1;
1299
+ float* dst = (float*)_dst;
1300
+ const float* src = (const float*)_src + (_ksize/2)*cn;
1301
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1302
+ const float* kx = (const float*)kernel.data + _ksize/2;
1303
+ width *= cn;
1304
+
1305
+ if( symmetrical )
1306
+ {
1307
+ if( _ksize == 1 )
1308
+ return 0;
1309
+ if( _ksize == 3 )
1310
+ {
1311
+ if( kx[0] == 2 && kx[1] == 1 )
1312
+ for( ; i <= width - 8; i += 8, src += 8 )
1313
+ {
1314
+ __m128 x0, x1, x2, y0, y1, y2;
1315
+ x0 = _mm_loadu_ps(src - cn);
1316
+ x1 = _mm_loadu_ps(src);
1317
+ x2 = _mm_loadu_ps(src + cn);
1318
+ y0 = _mm_loadu_ps(src - cn + 4);
1319
+ y1 = _mm_loadu_ps(src + 4);
1320
+ y2 = _mm_loadu_ps(src + cn + 4);
1321
+ x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
1322
+ y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
1323
+ _mm_store_ps(dst + i, x0);
1324
+ _mm_store_ps(dst + i + 4, y0);
1325
+ }
1326
+ else if( kx[0] == -2 && kx[1] == 1 )
1327
+ for( ; i <= width - 8; i += 8, src += 8 )
1328
+ {
1329
+ __m128 x0, x1, x2, y0, y1, y2;
1330
+ x0 = _mm_loadu_ps(src - cn);
1331
+ x1 = _mm_loadu_ps(src);
1332
+ x2 = _mm_loadu_ps(src + cn);
1333
+ y0 = _mm_loadu_ps(src - cn + 4);
1334
+ y1 = _mm_loadu_ps(src + 4);
1335
+ y2 = _mm_loadu_ps(src + cn + 4);
1336
+ x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
1337
+ y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
1338
+ _mm_store_ps(dst + i, x0);
1339
+ _mm_store_ps(dst + i + 4, y0);
1340
+ }
1341
+ else
1342
+ {
1343
+ __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
1344
+ for( ; i <= width - 8; i += 8, src += 8 )
1345
+ {
1346
+ __m128 x0, x1, x2, y0, y1, y2;
1347
+ x0 = _mm_loadu_ps(src - cn);
1348
+ x1 = _mm_loadu_ps(src);
1349
+ x2 = _mm_loadu_ps(src + cn);
1350
+ y0 = _mm_loadu_ps(src - cn + 4);
1351
+ y1 = _mm_loadu_ps(src + 4);
1352
+ y2 = _mm_loadu_ps(src + cn + 4);
1353
+
1354
+ x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
1355
+ y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
1356
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
1357
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
1358
+ _mm_store_ps(dst + i, x0);
1359
+ _mm_store_ps(dst + i + 4, y0);
1360
+ }
1361
+ }
1362
+ }
1363
+ else if( _ksize == 5 )
1364
+ {
1365
+ if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
1366
+ for( ; i <= width - 8; i += 8, src += 8 )
1367
+ {
1368
+ __m128 x0, x1, x2, y0, y1, y2;
1369
+ x0 = _mm_loadu_ps(src - cn*2);
1370
+ x1 = _mm_loadu_ps(src);
1371
+ x2 = _mm_loadu_ps(src + cn*2);
1372
+ y0 = _mm_loadu_ps(src - cn*2 + 4);
1373
+ y1 = _mm_loadu_ps(src + 4);
1374
+ y2 = _mm_loadu_ps(src + cn*2 + 4);
1375
+ x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
1376
+ y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
1377
+ _mm_store_ps(dst + i, x0);
1378
+ _mm_store_ps(dst + i + 4, y0);
1379
+ }
1380
+ else
1381
+ {
1382
+ __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
1383
+ for( ; i <= width - 8; i += 8, src += 8 )
1384
+ {
1385
+ __m128 x0, x1, x2, y0, y1, y2;
1386
+ x0 = _mm_loadu_ps(src - cn);
1387
+ x1 = _mm_loadu_ps(src);
1388
+ x2 = _mm_loadu_ps(src + cn);
1389
+ y0 = _mm_loadu_ps(src - cn + 4);
1390
+ y1 = _mm_loadu_ps(src + 4);
1391
+ y2 = _mm_loadu_ps(src + cn + 4);
1392
+
1393
+ x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
1394
+ y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
1395
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
1396
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
1397
+
1398
+ x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
1399
+ y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
1400
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
1401
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
1402
+
1403
+ _mm_store_ps(dst + i, x0);
1404
+ _mm_store_ps(dst + i + 4, y0);
1405
+ }
1406
+ }
1407
+ }
1408
+ }
1409
+ else
1410
+ {
1411
+ if( _ksize == 3 )
1412
+ {
1413
+ if( kx[0] == 0 && kx[1] == 1 )
1414
+ for( ; i <= width - 8; i += 8, src += 8 )
1415
+ {
1416
+ __m128 x0, x2, y0, y2;
1417
+ x0 = _mm_loadu_ps(src + cn);
1418
+ x2 = _mm_loadu_ps(src - cn);
1419
+ y0 = _mm_loadu_ps(src + cn + 4);
1420
+ y2 = _mm_loadu_ps(src - cn + 4);
1421
+ x0 = _mm_sub_ps(x0, x2);
1422
+ y0 = _mm_sub_ps(y0, y2);
1423
+ _mm_store_ps(dst + i, x0);
1424
+ _mm_store_ps(dst + i + 4, y0);
1425
+ }
1426
+ else
1427
+ {
1428
+ __m128 k1 = _mm_set1_ps(kx[1]);
1429
+ for( ; i <= width - 8; i += 8, src += 8 )
1430
+ {
1431
+ __m128 x0, x2, y0, y2;
1432
+ x0 = _mm_loadu_ps(src + cn);
1433
+ x2 = _mm_loadu_ps(src - cn);
1434
+ y0 = _mm_loadu_ps(src + cn + 4);
1435
+ y2 = _mm_loadu_ps(src - cn + 4);
1436
+
1437
+ x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
1438
+ y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
1439
+ _mm_store_ps(dst + i, x0);
1440
+ _mm_store_ps(dst + i + 4, y0);
1441
+ }
1442
+ }
1443
+ }
1444
+ else if( _ksize == 5 )
1445
+ {
1446
+ __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
1447
+ for( ; i <= width - 8; i += 8, src += 8 )
1448
+ {
1449
+ __m128 x0, x2, y0, y2;
1450
+ x0 = _mm_loadu_ps(src + cn);
1451
+ x2 = _mm_loadu_ps(src - cn);
1452
+ y0 = _mm_loadu_ps(src + cn + 4);
1453
+ y2 = _mm_loadu_ps(src - cn + 4);
1454
+
1455
+ x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
1456
+ y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
1457
+
1458
+ x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
1459
+ y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
1460
+ x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
1461
+ y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
1462
+
1463
+ _mm_store_ps(dst + i, x0);
1464
+ _mm_store_ps(dst + i + 4, y0);
1465
+ }
1466
+ }
1467
+ }
1468
+
1469
+ return i;
1470
+ }
1471
+
1472
+ Mat kernel;
1473
+ int symmetryType;
1474
+ };
1475
+
1476
+
1477
+ struct SymmColumnVec_32f
1478
+ {
1479
+ SymmColumnVec_32f() { symmetryType=0; }
1480
+ SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
1481
+ {
1482
+ symmetryType = _symmetryType;
1483
+ kernel = _kernel;
1484
+ delta = (float)_delta;
1485
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1486
+ }
1487
+
1488
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1489
+ {
1490
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1491
+ return 0;
1492
+
1493
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1494
+ const float* ky = (const float*)kernel.data + ksize2;
1495
+ int i = 0, k;
1496
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1497
+ const float** src = (const float**)_src;
1498
+ const float *S, *S2;
1499
+ float* dst = (float*)_dst;
1500
+ __m128 d4 = _mm_set1_ps(delta);
1501
+
1502
+ if( symmetrical )
1503
+ {
1504
+ for( ; i <= width - 16; i += 16 )
1505
+ {
1506
+ __m128 f = _mm_load_ss(ky);
1507
+ f = _mm_shuffle_ps(f, f, 0);
1508
+ __m128 s0, s1, s2, s3;
1509
+ __m128 x0, x1;
1510
+ S = src[0] + i;
1511
+ s0 = _mm_load_ps(S);
1512
+ s1 = _mm_load_ps(S+4);
1513
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1514
+ s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
1515
+ s2 = _mm_load_ps(S+8);
1516
+ s3 = _mm_load_ps(S+12);
1517
+ s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
1518
+ s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
1519
+
1520
+ for( k = 1; k <= ksize2; k++ )
1521
+ {
1522
+ S = src[k] + i;
1523
+ S2 = src[-k] + i;
1524
+ f = _mm_load_ss(ky+k);
1525
+ f = _mm_shuffle_ps(f, f, 0);
1526
+ x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
1527
+ x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1528
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1529
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1530
+ x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1531
+ x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1532
+ s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1533
+ s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1534
+ }
1535
+
1536
+ _mm_storeu_ps(dst + i, s0);
1537
+ _mm_storeu_ps(dst + i + 4, s1);
1538
+ _mm_storeu_ps(dst + i + 8, s2);
1539
+ _mm_storeu_ps(dst + i + 12, s3);
1540
+ }
1541
+
1542
+ for( ; i <= width - 4; i += 4 )
1543
+ {
1544
+ __m128 f = _mm_load_ss(ky);
1545
+ f = _mm_shuffle_ps(f, f, 0);
1546
+ __m128 x0, s0 = _mm_load_ps(src[0] + i);
1547
+ s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
1548
+
1549
+ for( k = 1; k <= ksize2; k++ )
1550
+ {
1551
+ f = _mm_load_ss(ky+k);
1552
+ f = _mm_shuffle_ps(f, f, 0);
1553
+ S = src[k] + i;
1554
+ S2 = src[-k] + i;
1555
+ x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1556
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1557
+ }
1558
+
1559
+ _mm_storeu_ps(dst + i, s0);
1560
+ }
1561
+ }
1562
+ else
1563
+ {
1564
+ for( ; i <= width - 16; i += 16 )
1565
+ {
1566
+ __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1567
+ __m128 x0, x1;
1568
+ S = src[0] + i;
1569
+
1570
+ for( k = 1; k <= ksize2; k++ )
1571
+ {
1572
+ S = src[k] + i;
1573
+ S2 = src[-k] + i;
1574
+ f = _mm_load_ss(ky+k);
1575
+ f = _mm_shuffle_ps(f, f, 0);
1576
+ x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
1577
+ x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1578
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1579
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1580
+ x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1581
+ x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1582
+ s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1583
+ s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1584
+ }
1585
+
1586
+ _mm_storeu_ps(dst + i, s0);
1587
+ _mm_storeu_ps(dst + i + 4, s1);
1588
+ _mm_storeu_ps(dst + i + 8, s2);
1589
+ _mm_storeu_ps(dst + i + 12, s3);
1590
+ }
1591
+
1592
+ for( ; i <= width - 4; i += 4 )
1593
+ {
1594
+ __m128 f, x0, s0 = d4;
1595
+
1596
+ for( k = 1; k <= ksize2; k++ )
1597
+ {
1598
+ f = _mm_load_ss(ky+k);
1599
+ f = _mm_shuffle_ps(f, f, 0);
1600
+ x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1601
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1602
+ }
1603
+
1604
+ _mm_storeu_ps(dst + i, s0);
1605
+ }
1606
+ }
1607
+
1608
+ return i;
1609
+ }
1610
+
1611
+ int symmetryType;
1612
+ float delta;
1613
+ Mat kernel;
1614
+ };
1615
+
1616
+
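+ // Specialized vertical pass for 3-tap float kernels, with dedicated branches for the
+ // common [1 2 1], [1 -2 1] and +/-[-1 0 1] kernels and a generic two-coefficient fallback.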
1617
+ struct SymmColumnSmallVec_32f
1618
+ {
1619
+ SymmColumnSmallVec_32f() { symmetryType=0; }
1620
+ SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
1621
+ {
1622
+ symmetryType = _symmetryType;
1623
+ kernel = _kernel;
1624
+ delta = (float)_delta;
1625
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
1626
+ }
1627
+
1628
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1629
+ {
1630
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1631
+ return 0;
1632
+
1633
+ int ksize2 = (kernel.rows + kernel.cols - 1)/2;
1634
+ const float* ky = (const float*)kernel.data + ksize2;
1635
+ int i = 0;
1636
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
1637
+ const float** src = (const float**)_src;
1638
+ const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
1639
+ float* dst = (float*)_dst;
1640
+ __m128 d4 = _mm_set1_ps(delta);
1641
+
1642
+ if( symmetrical )
1643
+ {
1644
+ if( ky[0] == 2 && ky[1] == 1 )
1645
+ {
1646
+ for( ; i <= width - 8; i += 8 )
1647
+ {
1648
+ __m128 s0, s1, s2, s3, s4, s5;
1649
+ s0 = _mm_load_ps(S0 + i);
1650
+ s1 = _mm_load_ps(S0 + i + 4);
1651
+ s2 = _mm_load_ps(S1 + i);
1652
+ s3 = _mm_load_ps(S1 + i + 4);
1653
+ s4 = _mm_load_ps(S2 + i);
1654
+ s5 = _mm_load_ps(S2 + i + 4);
1655
+ s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
1656
+ s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
1657
+ s0 = _mm_add_ps(s0, d4);
1658
+ s1 = _mm_add_ps(s1, d4);
1659
+ _mm_storeu_ps(dst + i, s0);
1660
+ _mm_storeu_ps(dst + i + 4, s1);
1661
+ }
1662
+ }
1663
+ else if( ky[0] == -2 && ky[1] == 1 )
1664
+ {
1665
+ for( ; i <= width - 8; i += 8 )
1666
+ {
1667
+ __m128 s0, s1, s2, s3, s4, s5;
1668
+ s0 = _mm_load_ps(S0 + i);
1669
+ s1 = _mm_load_ps(S0 + i + 4);
1670
+ s2 = _mm_load_ps(S1 + i);
1671
+ s3 = _mm_load_ps(S1 + i + 4);
1672
+ s4 = _mm_load_ps(S2 + i);
1673
+ s5 = _mm_load_ps(S2 + i + 4);
1674
+ s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
1675
+ s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
1676
+ s0 = _mm_add_ps(s0, d4);
1677
+ s1 = _mm_add_ps(s1, d4);
1678
+ _mm_storeu_ps(dst + i, s0);
1679
+ _mm_storeu_ps(dst + i + 4, s1);
1680
+ }
1681
+ }
1682
+ else
1683
+ {
1684
+ __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
1685
+ for( ; i <= width - 8; i += 8 )
1686
+ {
1687
+ __m128 s0, s1, x0, x1;
1688
+ s0 = _mm_load_ps(S1 + i);
1689
+ s1 = _mm_load_ps(S1 + i + 4);
1690
+ s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
1691
+ s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
1692
+ x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
1693
+ x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
1694
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
1695
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
1696
+ _mm_storeu_ps(dst + i, s0);
1697
+ _mm_storeu_ps(dst + i + 4, s1);
1698
+ }
1699
+ }
1700
+ }
1701
+ else
1702
+ {
1703
+ if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
1704
+ {
1705
+ if( ky[1] < 0 )
1706
+ std::swap(S0, S2);
1707
+ for( ; i <= width - 8; i += 8 )
1708
+ {
1709
+ __m128 s0, s1, s2, s3;
1710
+ s0 = _mm_load_ps(S2 + i);
1711
+ s1 = _mm_load_ps(S2 + i + 4);
1712
+ s2 = _mm_load_ps(S0 + i);
1713
+ s3 = _mm_load_ps(S0 + i + 4);
1714
+ s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
1715
+ s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
1716
+ _mm_storeu_ps(dst + i, s0);
1717
+ _mm_storeu_ps(dst + i + 4, s1);
1718
+ }
1719
+ }
1720
+ else
1721
+ {
1722
+ __m128 k1 = _mm_set1_ps(ky[1]);
1723
+ for( ; i <= width - 8; i += 8 )
1724
+ {
1725
+ __m128 s0 = d4, s1 = d4, x0, x1;
1726
+ x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
1727
+ x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
1728
+ s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
1729
+ s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
1730
+ _mm_storeu_ps(dst + i, s0);
1731
+ _mm_storeu_ps(dst + i + 4, s1);
1732
+ }
1733
+ }
1734
+ }
1735
+
1736
+ return i;
1737
+ }
1738
+
1739
+ int symmetryType;
1740
+ float delta;
1741
+ Mat kernel;
1742
+ };
1743
+
1744
+
1745
+ /////////////////////////////// non-separable filters ///////////////////////////////
1746
+
1747
+ ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
1748
+
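+ // Non-separable SSE2 kernel for 8u->8u filtering: the 2D kernel is flattened by
+ // preprocess2DKernel() into non-zero (coordinate, coefficient) pairs; pixels are
+ // widened to float, multiply-accumulated per tap, then packed back to 8 bits with
+ // saturation.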
1749
+ struct FilterVec_8u
1750
+ {
1751
+ FilterVec_8u() {}
1752
+ FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
1753
+ {
1754
+ Mat kernel;
1755
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1756
+ delta = (float)(_delta/(1 << _bits));
1757
+ vector<Point> coords;
1758
+ preprocess2DKernel(kernel, coords, coeffs);
1759
+ _nz = (int)coords.size();
1760
+ }
1761
+
1762
+ int operator()(const uchar** src, uchar* dst, int width) const
1763
+ {
1764
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
1765
+ return 0;
1766
+
1767
+ const float* kf = (const float*)&coeffs[0];
1768
+ int i = 0, k, nz = _nz;
1769
+ __m128 d4 = _mm_set1_ps(delta);
1770
+
1771
+ for( ; i <= width - 16; i += 16 )
1772
+ {
1773
+ __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1774
+ __m128i x0, x1, z = _mm_setzero_si128();
1775
+
1776
+ for( k = 0; k < nz; k++ )
1777
+ {
1778
+ __m128 f = _mm_load_ss(kf+k), t0, t1;
1779
+ f = _mm_shuffle_ps(f, f, 0);
1780
+
1781
+ x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
1782
+ x1 = _mm_unpackhi_epi8(x0, z);
1783
+ x0 = _mm_unpacklo_epi8(x0, z);
1784
+
1785
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1786
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
1787
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1788
+ s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1789
+
1790
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
1791
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
1792
+ s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1793
+ s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1794
+ }
1795
+
1796
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1797
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1798
+ x0 = _mm_packus_epi16(x0, x1);
1799
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1800
+ }
1801
+
1802
+ for( ; i <= width - 4; i += 4 )
1803
+ {
1804
+ __m128 s0 = d4;
1805
+ __m128i x0, z = _mm_setzero_si128();
1806
+
1807
+ for( k = 0; k < nz; k++ )
1808
+ {
1809
+ __m128 f = _mm_load_ss(kf+k), t0;
1810
+ f = _mm_shuffle_ps(f, f, 0);
1811
+
1812
+ x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
1813
+ x0 = _mm_unpacklo_epi8(x0, z);
1814
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1815
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1816
+ }
1817
+
1818
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
1819
+ x0 = _mm_packus_epi16(x0, x0);
1820
+ *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
1821
+ }
1822
+
1823
+ return i;
1824
+ }
1825
+
1826
+ int _nz;
1827
+ vector<uchar> coeffs;
1828
+ float delta;
1829
+ };
1830
+
1831
+
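+ // Same accumulation scheme as FilterVec_8u, but the result is stored as signed
+ // 16-bit values instead of being packed back down to 8 bits.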
1832
+ struct FilterVec_8u16s
1833
+ {
1834
+ FilterVec_8u16s() {}
1835
+ FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
1836
+ {
1837
+ Mat kernel;
1838
+ _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
1839
+ delta = (float)(_delta/(1 << _bits));
1840
+ vector<Point> coords;
1841
+ preprocess2DKernel(kernel, coords, coeffs);
1842
+ _nz = (int)coords.size();
1843
+ }
1844
+
1845
+ int operator()(const uchar** src, uchar* _dst, int width) const
1846
+ {
1847
+ if( !checkHardwareSupport(CV_CPU_SSE2) )
1848
+ return 0;
1849
+
1850
+ const float* kf = (const float*)&coeffs[0];
1851
+ short* dst = (short*)_dst;
1852
+ int i = 0, k, nz = _nz;
1853
+ __m128 d4 = _mm_set1_ps(delta);
1854
+
1855
+ for( ; i <= width - 16; i += 16 )
1856
+ {
1857
+ __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1858
+ __m128i x0, x1, z = _mm_setzero_si128();
1859
+
1860
+ for( k = 0; k < nz; k++ )
1861
+ {
1862
+ __m128 f = _mm_load_ss(kf+k), t0, t1;
1863
+ f = _mm_shuffle_ps(f, f, 0);
1864
+
1865
+ x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
1866
+ x1 = _mm_unpackhi_epi8(x0, z);
1867
+ x0 = _mm_unpacklo_epi8(x0, z);
1868
+
1869
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1870
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
1871
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1872
+ s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1873
+
1874
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
1875
+ t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
1876
+ s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1877
+ s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1878
+ }
1879
+
1880
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
1881
+ x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
1882
+ _mm_storeu_si128((__m128i*)(dst + i), x0);
1883
+ _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
1884
+ }
1885
+
1886
+ for( ; i <= width - 4; i += 4 )
1887
+ {
1888
+ __m128 s0 = d4;
1889
+ __m128i x0, z = _mm_setzero_si128();
1890
+
1891
+ for( k = 0; k < nz; k++ )
1892
+ {
1893
+ __m128 f = _mm_load_ss(kf+k), t0;
1894
+ f = _mm_shuffle_ps(f, f, 0);
1895
+
1896
+ x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
1897
+ x0 = _mm_unpacklo_epi8(x0, z);
1898
+ t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
1899
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1900
+ }
1901
+
1902
+ x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
1903
+ _mm_storel_epi64((__m128i*)(dst + i), x0);
1904
+ }
1905
+
1906
+ return i;
1907
+ }
1908
+
1909
+ int _nz;
1910
+ vector<uchar> coeffs;
1911
+ float delta;
1912
+ };
1913
+
1914
+
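+ // Float-to-float non-separable kernel: unaligned loads, one multiply-accumulate per
+ // non-zero tap, 16 pixels per iteration with a 4-pixel tail loop.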
1915
+ struct FilterVec_32f
1916
+ {
1917
+ FilterVec_32f() {}
1918
+ FilterVec_32f(const Mat& _kernel, int, double _delta)
1919
+ {
1920
+ delta = (float)_delta;
1921
+ vector<Point> coords;
1922
+ preprocess2DKernel(_kernel, coords, coeffs);
1923
+ _nz = (int)coords.size();
1924
+ }
1925
+
1926
+ int operator()(const uchar** _src, uchar* _dst, int width) const
1927
+ {
1928
+ if( !checkHardwareSupport(CV_CPU_SSE) )
1929
+ return 0;
1930
+
1931
+ const float* kf = (const float*)&coeffs[0];
1932
+ const float** src = (const float**)_src;
1933
+ float* dst = (float*)_dst;
1934
+ int i = 0, k, nz = _nz;
1935
+ __m128 d4 = _mm_set1_ps(delta);
1936
+
1937
+ for( ; i <= width - 16; i += 16 )
1938
+ {
1939
+ __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1940
+
1941
+ for( k = 0; k < nz; k++ )
1942
+ {
1943
+ __m128 f = _mm_load_ss(kf+k), t0, t1;
1944
+ f = _mm_shuffle_ps(f, f, 0);
1945
+ const float* S = src[k] + i;
1946
+
1947
+ t0 = _mm_loadu_ps(S);
1948
+ t1 = _mm_loadu_ps(S + 4);
1949
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1950
+ s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
1951
+
1952
+ t0 = _mm_loadu_ps(S + 8);
1953
+ t1 = _mm_loadu_ps(S + 12);
1954
+ s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
1955
+ s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
1956
+ }
1957
+
1958
+ _mm_storeu_ps(dst + i, s0);
1959
+ _mm_storeu_ps(dst + i + 4, s1);
1960
+ _mm_storeu_ps(dst + i + 8, s2);
1961
+ _mm_storeu_ps(dst + i + 12, s3);
1962
+ }
1963
+
1964
+ for( ; i <= width - 4; i += 4 )
1965
+ {
1966
+ __m128 s0 = d4;
1967
+
1968
+ for( k = 0; k < nz; k++ )
1969
+ {
1970
+ __m128 f = _mm_load_ss(kf+k), t0;
1971
+ f = _mm_shuffle_ps(f, f, 0);
1972
+ t0 = _mm_loadu_ps(src[k] + i);
1973
+ s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
1974
+ }
1975
+ _mm_storeu_ps(dst + i, s0);
1976
+ }
1977
+
1978
+ return i;
1979
+ }
1980
+
1981
+ int _nz;
1982
+ vector<uchar> coeffs;
1983
+ float delta;
1984
+ };
1985
+
1986
+
1987
+ #else
1988
+
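+ // Without SSE support at build time, every vector operator degenerates to the
+ // corresponding *NoVec stub, which returns 0 so the scalar loops below do all the work.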
1989
+ typedef RowNoVec RowVec_8u32s;
1990
+ typedef RowNoVec RowVec_32f;
1991
+ typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
1992
+ typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
1993
+ typedef ColumnNoVec SymmColumnVec_32s8u;
1994
+ typedef ColumnNoVec SymmColumnVec_32f;
1995
+ typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
1996
+ typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
1997
+ typedef FilterNoVec FilterVec_8u;
1998
+ typedef FilterNoVec FilterVec_8u16s;
1999
+ typedef FilterNoVec FilterVec_32f;
2000
+
2001
+ #endif
2002
+
2003
+
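+ // Generic horizontal convolution: vecOp handles a SIMD prefix and returns how many
+ // outputs it produced; the rest of the row is finished with a 4-way unrolled scalar
+ // loop plus a plain tail.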
2004
+ template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
2005
+ {
2006
+ RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
2007
+ {
2008
+ if( _kernel.isContinuous() )
2009
+ kernel = _kernel;
2010
+ else
2011
+ _kernel.copyTo(kernel);
2012
+ anchor = _anchor;
2013
+ ksize = kernel.rows + kernel.cols - 1;
2014
+ CV_Assert( kernel.type() == DataType<DT>::type &&
2015
+ (kernel.rows == 1 || kernel.cols == 1));
2016
+ vecOp = _vecOp;
2017
+ }
2018
+
2019
+ void operator()(const uchar* src, uchar* dst, int width, int cn)
2020
+ {
2021
+ int _ksize = ksize;
2022
+ const DT* kx = (const DT*)kernel.data;
2023
+ const ST* S;
2024
+ DT* D = (DT*)dst;
2025
+ int i, k;
2026
+
2027
+ i = vecOp(src, dst, width, cn);
2028
+ width *= cn;
2029
+
2030
+ for( ; i <= width - 4; i += 4 )
2031
+ {
2032
+ S = (const ST*)src + i;
2033
+ DT f = kx[0];
2034
+ DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];
2035
+
2036
+ for( k = 1; k < _ksize; k++ )
2037
+ {
2038
+ S += cn;
2039
+ f = kx[k];
2040
+ s0 += f*S[0]; s1 += f*S[1];
2041
+ s2 += f*S[2]; s3 += f*S[3];
2042
+ }
2043
+
2044
+ D[i] = s0; D[i+1] = s1;
2045
+ D[i+2] = s2; D[i+3] = s3;
2046
+ }
2047
+
2048
+ for( ; i < width; i++ )
2049
+ {
2050
+ S = (const ST*)src + i;
2051
+ DT s0 = kx[0]*S[0];
2052
+ for( k = 1; k < _ksize; k++ )
2053
+ {
2054
+ S += cn;
2055
+ s0 += kx[k]*S[0];
2056
+ }
2057
+ D[i] = s0;
2058
+ }
2059
+ }
2060
+
2061
+ Mat kernel;
2062
+ VecOp vecOp;
2063
+ };
2064
+
2065
+
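+ // Row filter for symmetric/asymmetric kernels of size <= 5. kx points at the kernel
+ // centre, so each tap is applied as kx[k]*(S[j] + S[-j]) or kx[k]*(S[j] - S[-j]),
+ // with hard-coded fast paths for the usual smoothing and derivative kernels.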
2066
+ template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
2067
+ public RowFilter<ST, DT, VecOp>
2068
+ {
2069
+ SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
2070
+ const VecOp& _vecOp = VecOp())
2071
+ : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
2072
+ {
2073
+ symmetryType = _symmetryType;
2074
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
2075
+ }
2076
+
2077
+ void operator()(const uchar* src, uchar* dst, int width, int cn)
2078
+ {
2079
+ int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
2080
+ const DT* kx = (const DT*)this->kernel.data + ksize2;
2081
+ bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2082
+ DT* D = (DT*)dst;
2083
+ int i = this->vecOp(src, dst, width, cn), j, k;
2084
+ const ST* S = (const ST*)src + i + ksize2n;
2085
+ width *= cn;
2086
+
2087
+ if( symmetrical )
2088
+ {
2089
+ if( this->ksize == 1 && kx[0] == 1 )
2090
+ {
2091
+ for( ; i <= width - 2; i += 2 )
2092
+ {
2093
+ DT s0 = S[i], s1 = S[i+1];
2094
+ D[i] = s0; D[i+1] = s1;
2095
+ }
2096
+ S += i;
2097
+ }
2098
+ else if( this->ksize == 3 )
2099
+ {
2100
+ if( kx[0] == 2 && kx[1] == 1 )
2101
+ for( ; i <= width - 2; i += 2, S += 2 )
2102
+ {
2103
+ DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
2104
+ D[i] = s0; D[i+1] = s1;
2105
+ }
2106
+ else if( kx[0] == -2 && kx[1] == 1 )
2107
+ for( ; i <= width - 2; i += 2, S += 2 )
2108
+ {
2109
+ DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
2110
+ D[i] = s0; D[i+1] = s1;
2111
+ }
2112
+ else
2113
+ {
2114
+ DT k0 = kx[0], k1 = kx[1];
2115
+ for( ; i <= width - 2; i += 2, S += 2 )
2116
+ {
2117
+ DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
2118
+ D[i] = s0; D[i+1] = s1;
2119
+ }
2120
+ }
2121
+ }
2122
+ else if( this->ksize == 5 )
2123
+ {
2124
+ DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
2125
+ if( k0 == -2 && k1 == 0 && k2 == 1 )
2126
+ for( ; i <= width - 2; i += 2, S += 2 )
2127
+ {
2128
+ DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
2129
+ DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
2130
+ D[i] = s0; D[i+1] = s1;
2131
+ }
2132
+ else
2133
+ for( ; i <= width - 2; i += 2, S += 2 )
2134
+ {
2135
+ DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
2136
+ DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
2137
+ D[i] = s0; D[i+1] = s1;
2138
+ }
2139
+ }
2140
+
2141
+ for( ; i < width; i++, S++ )
2142
+ {
2143
+ DT s0 = kx[0]*S[0];
2144
+ for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2145
+ s0 += kx[k]*(S[j] + S[-j]);
2146
+ D[i] = s0;
2147
+ }
2148
+ }
2149
+ else
2150
+ {
2151
+ if( this->ksize == 3 )
2152
+ {
2153
+ if( kx[0] == 0 && kx[1] == 1 )
2154
+ for( ; i <= width - 2; i += 2, S += 2 )
2155
+ {
2156
+ DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
2157
+ D[i] = s0; D[i+1] = s1;
2158
+ }
2159
+ else
2160
+ {
2161
+ DT k1 = kx[1];
2162
+ for( ; i <= width - 2; i += 2, S += 2 )
2163
+ {
2164
+ DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
2165
+ D[i] = s0; D[i+1] = s1;
2166
+ }
2167
+ }
2168
+ }
2169
+ else if( this->ksize == 5 )
2170
+ {
2171
+ DT k1 = kx[1], k2 = kx[2];
2172
+ for( ; i <= width - 2; i += 2, S += 2 )
2173
+ {
2174
+ DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
2175
+ DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
2176
+ D[i] = s0; D[i+1] = s1;
2177
+ }
2178
+ }
2179
+
2180
+ for( ; i < width; i++, S++ )
2181
+ {
2182
+ DT s0 = kx[0]*S[0];
2183
+ for( k = 1, j = cn; k <= ksize2; k++, j += cn )
2184
+ s0 += kx[k]*(S[j] - S[-j]);
2185
+ D[i] = s0;
2186
+ }
2187
+ }
2188
+ }
2189
+
2190
+ int symmetryType;
2191
+ };
2192
+
2193
+
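+ // Generic vertical pass over the ksize buffered rows. CastOp converts the accumulator
+ // type (int or float) to the destination type; the FixedPtCast variants also shift
+ // fixed-point sums back down by the scaling applied to integer kernels.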
2194
+ template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
2195
+ {
2196
+ typedef typename CastOp::type1 ST;
2197
+ typedef typename CastOp::rtype DT;
2198
+
2199
+ ColumnFilter( const Mat& _kernel, int _anchor,
2200
+ double _delta, const CastOp& _castOp=CastOp(),
2201
+ const VecOp& _vecOp=VecOp() )
2202
+ {
2203
+ if( _kernel.isContinuous() )
2204
+ kernel = _kernel;
2205
+ else
2206
+ _kernel.copyTo(kernel);
2207
+ anchor = _anchor;
2208
+ ksize = kernel.rows + kernel.cols - 1;
2209
+ delta = saturate_cast<ST>(_delta);
2210
+ castOp0 = _castOp;
2211
+ vecOp = _vecOp;
2212
+ CV_Assert( kernel.type() == DataType<ST>::type &&
2213
+ (kernel.rows == 1 || kernel.cols == 1));
2214
+ }
2215
+
2216
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2217
+ {
2218
+ const ST* ky = (const ST*)kernel.data;
2219
+ ST _delta = delta;
2220
+ int _ksize = ksize;
2221
+ int i, k;
2222
+ CastOp castOp = castOp0;
2223
+
2224
+ for( ; count--; dst += dststep, src++ )
2225
+ {
2226
+ DT* D = (DT*)dst;
2227
+ i = vecOp(src, dst, width);
2228
+ for( ; i <= width - 4; i += 4 )
2229
+ {
2230
+ ST f = ky[0];
2231
+ const ST* S = (const ST*)src[0] + i;
2232
+ ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2233
+ s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2234
+
2235
+ for( k = 1; k < _ksize; k++ )
2236
+ {
2237
+ S = (const ST*)src[k] + i; f = ky[k];
2238
+ s0 += f*S[0]; s1 += f*S[1];
2239
+ s2 += f*S[2]; s3 += f*S[3];
2240
+ }
2241
+
2242
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2243
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2244
+ }
2245
+
2246
+ for( ; i < width; i++ )
2247
+ {
2248
+ ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2249
+ for( k = 1; k < _ksize; k++ )
2250
+ s0 += ky[k]*((const ST*)src[k])[i];
2251
+ D[i] = castOp(s0);
2252
+ }
2253
+ }
2254
+ }
2255
+
2256
+ Mat kernel;
2257
+ CastOp castOp0;
2258
+ VecOp vecOp;
2259
+ ST delta;
2260
+ };
2261
+
2262
+
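+ // Column filter exploiting kernel symmetry: ky indexes from the centre, and each pair
+ // of mirrored rows is added (symmetric) or subtracted (asymmetric) before the shared
+ // coefficient is applied, halving the number of multiplications.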
2263
+ template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
2264
+ {
2265
+ typedef typename CastOp::type1 ST;
2266
+ typedef typename CastOp::rtype DT;
2267
+
2268
+ SymmColumnFilter( const Mat& _kernel, int _anchor,
2269
+ double _delta, int _symmetryType,
2270
+ const CastOp& _castOp=CastOp(),
2271
+ const VecOp& _vecOp=VecOp())
2272
+ : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
2273
+ {
2274
+ symmetryType = _symmetryType;
2275
+ CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
2276
+ }
2277
+
2278
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2279
+ {
2280
+ int ksize2 = this->ksize/2;
2281
+ const ST* ky = (const ST*)this->kernel.data + ksize2;
2282
+ int i, k;
2283
+ bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
2284
+ ST _delta = this->delta;
2285
+ CastOp castOp = this->castOp0;
2286
+ src += ksize2;
2287
+
2288
+ if( symmetrical )
2289
+ {
2290
+ for( ; count--; dst += dststep, src++ )
2291
+ {
2292
+ DT* D = (DT*)dst;
2293
+ i = (this->vecOp)(src, dst, width);
2294
+
2295
+ for( ; i <= width - 4; i += 4 )
2296
+ {
2297
+ ST f = ky[0];
2298
+ const ST* S = (const ST*)src[0] + i, *S2;
2299
+ ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
2300
+ s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
2301
+
2302
+ for( k = 1; k <= ksize2; k++ )
2303
+ {
2304
+ S = (const ST*)src[k] + i;
2305
+ S2 = (const ST*)src[-k] + i;
2306
+ f = ky[k];
2307
+ s0 += f*(S[0] + S2[0]);
2308
+ s1 += f*(S[1] + S2[1]);
2309
+ s2 += f*(S[2] + S2[2]);
2310
+ s3 += f*(S[3] + S2[3]);
2311
+ }
2312
+
2313
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2314
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2315
+ }
2316
+
2317
+ for( ; i < width; i++ )
2318
+ {
2319
+ ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
2320
+ for( k = 1; k <= ksize2; k++ )
2321
+ s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
2322
+ D[i] = castOp(s0);
2323
+ }
2324
+ }
2325
+ }
2326
+ else
2327
+ {
2328
+ for( ; count--; dst += dststep, src++ )
2329
+ {
2330
+ DT* D = (DT*)dst;
2331
+ i = this->vecOp(src, dst, width);
2332
+
2333
+ for( ; i <= width - 4; i += 4 )
2334
+ {
2335
+ ST f = ky[0];
2336
+ const ST *S, *S2;
2337
+ ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2338
+
2339
+ for( k = 1; k <= ksize2; k++ )
2340
+ {
2341
+ S = (const ST*)src[k] + i;
2342
+ S2 = (const ST*)src[-k] + i;
2343
+ f = ky[k];
2344
+ s0 += f*(S[0] - S2[0]);
2345
+ s1 += f*(S[1] - S2[1]);
2346
+ s2 += f*(S[2] - S2[2]);
2347
+ s3 += f*(S[3] - S2[3]);
2348
+ }
2349
+
2350
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2351
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2352
+ }
2353
+
2354
+ for( ; i < width; i++ )
2355
+ {
2356
+ ST s0 = _delta;
2357
+ for( k = 1; k <= ksize2; k++ )
2358
+ s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
2359
+ D[i] = castOp(s0);
2360
+ }
2361
+ }
2362
+ }
2363
+ }
2364
+
2365
+ int symmetryType;
2366
+ };
2367
+
2368
+
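+ // 3-tap column specialization with dedicated branches for [1 2 1], [1 -2 1] and the
+ // +/-[-1 0 1] derivative kernels; everything else uses the generic two-coefficient form.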
2369
+ template<class CastOp, class VecOp>
2370
+ struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
2371
+ {
2372
+ typedef typename CastOp::type1 ST;
2373
+ typedef typename CastOp::rtype DT;
2374
+
2375
+ SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
2376
+ double _delta, int _symmetryType,
2377
+ const CastOp& _castOp=CastOp(),
2378
+ const VecOp& _vecOp=VecOp())
2379
+ : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
2380
+ {
2381
+ CV_Assert( this->ksize == 3 );
2382
+ }
2383
+
2384
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
2385
+ {
2386
+ int ksize2 = this->ksize/2;
2387
+ const ST* ky = (const ST*)this->kernel.data + ksize2;
2388
+ int i;
2389
+ bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
2390
+ bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
2391
+ bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
2392
+ bool is_m1_0_1 = ky[1] == 1 || ky[1] == -1;
2393
+ ST f0 = ky[0], f1 = ky[1];
2394
+ ST _delta = this->delta;
2395
+ CastOp castOp = this->castOp0;
2396
+ src += ksize2;
2397
+
2398
+ for( ; count--; dst += dststep, src++ )
2399
+ {
2400
+ DT* D = (DT*)dst;
2401
+ i = (this->vecOp)(src, dst, width);
2402
+ const ST* S0 = (const ST*)src[-1];
2403
+ const ST* S1 = (const ST*)src[0];
2404
+ const ST* S2 = (const ST*)src[1];
2405
+
2406
+ if( symmetrical )
2407
+ {
2408
+ if( is_1_2_1 )
2409
+ {
2410
+ for( ; i <= width - 4; i += 4 )
2411
+ {
2412
+ ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
2413
+ ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
2414
+ D[i] = castOp(s0);
2415
+ D[i+1] = castOp(s1);
2416
+
2417
+ s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
2418
+ s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
2419
+ D[i+2] = castOp(s0);
2420
+ D[i+3] = castOp(s1);
2421
+ }
2422
+ }
2423
+ else if( is_1_m2_1 )
2424
+ {
2425
+ for( ; i <= width - 4; i += 4 )
2426
+ {
2427
+ ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
2428
+ ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
2429
+ D[i] = castOp(s0);
2430
+ D[i+1] = castOp(s1);
2431
+
2432
+ s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
2433
+ s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
2434
+ D[i+2] = castOp(s0);
2435
+ D[i+3] = castOp(s1);
2436
+ }
2437
+ }
2438
+ else
2439
+ {
2440
+ for( ; i <= width - 4; i += 4 )
2441
+ {
2442
+ ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
2443
+ ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
2444
+ D[i] = castOp(s0);
2445
+ D[i+1] = castOp(s1);
2446
+
2447
+ s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
2448
+ s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
2449
+ D[i+2] = castOp(s0);
2450
+ D[i+3] = castOp(s1);
2451
+ }
2452
+ }
2453
+
2454
+ for( ; i < width; i++ )
2455
+ D[i] = castOp((S0[i] + S2[i])*f1 + S1[i]*f0 + _delta);
2456
+ }
2457
+ else
2458
+ {
2459
+ if( is_m1_0_1 )
2460
+ {
2461
+ if( f1 < 0 )
2462
+ std::swap(S0, S2);
2463
+
2464
+ for( ; i <= width - 4; i += 4 )
2465
+ {
2466
+ ST s0 = S2[i] - S0[i] + _delta;
2467
+ ST s1 = S2[i+1] - S0[i+1] + _delta;
2468
+ D[i] = castOp(s0);
2469
+ D[i+1] = castOp(s1);
2470
+
2471
+ s0 = S2[i+2] - S0[i+2] + _delta;
2472
+ s1 = S2[i+3] - S0[i+3] + _delta;
2473
+ D[i+2] = castOp(s0);
2474
+ D[i+3] = castOp(s1);
2475
+ }
2476
+
2477
+ if( f1 < 0 )
2478
+ std::swap(S0, S2);
2479
+ }
2480
+ else
2481
+ {
2482
+ for( ; i <= width - 4; i += 4 )
2483
+ {
2484
+ ST s0 = (S2[i] - S0[i])*f1 + _delta;
2485
+ ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
2486
+ D[i] = castOp(s0);
2487
+ D[i+1] = castOp(s1);
2488
+
2489
+ s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
2490
+ s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
2491
+ D[i+2] = castOp(s0);
2492
+ D[i+3] = castOp(s1);
2493
+ }
2494
+ }
2495
+
2496
+ for( ; i < width; i++ )
2497
+ D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
2498
+ }
2499
+ }
2500
+ }
2501
+ };
2502
+
2503
+ template<typename ST, typename DT> struct Cast
2504
+ {
2505
+ typedef ST type1;
2506
+ typedef DT rtype;
2507
+
2508
+ DT operator()(ST val) const { return saturate_cast<DT>(val); }
2509
+ };
2510
+
2511
+ template<typename ST, typename DT, int bits> struct FixedPtCast
2512
+ {
2513
+ typedef ST type1;
2514
+ typedef DT rtype;
2515
+ enum { SHIFT = bits, DELTA = 1 << (bits-1) };
2516
+
2517
+ DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
2518
+ };
2519
+
2520
+ template<typename ST, typename DT> struct FixedPtCastEx
2521
+ {
2522
+ typedef ST type1;
2523
+ typedef DT rtype;
2524
+
2525
+ FixedPtCastEx() : SHIFT(0), DELTA(0) {}
2526
+ FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
2527
+ DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
2528
+ int SHIFT, DELTA;
2529
+ };
2530
+
2531
+ }
2532
+
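+ // Factory for the horizontal pass: symmetric/asymmetric kernels of size <= 5 get the
+ // SymmRowSmallFilter specialization, 8u->32s and 32f->32f rows get the SSE vector ops,
+ // and the remaining depth combinations fall back to scalar RowFilter instantiations.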
2533
+ cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
2534
+ InputArray _kernel, int anchor,
2535
+ int symmetryType )
2536
+ {
2537
+ Mat kernel = _kernel.getMat();
2538
+ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
2539
+ int cn = CV_MAT_CN(srcType);
2540
+ CV_Assert( cn == CV_MAT_CN(bufType) &&
2541
+ ddepth >= std::max(sdepth, CV_32S) &&
2542
+ kernel.type() == ddepth );
2543
+ int ksize = kernel.rows + kernel.cols - 1;
2544
+
2545
+ if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
2546
+ {
2547
+ if( sdepth == CV_8U && ddepth == CV_32S )
2548
+ return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
2549
+ (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
2550
+ if( sdepth == CV_32F && ddepth == CV_32F )
2551
+ return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
2552
+ (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
2553
+ }
2554
+
2555
+ if( sdepth == CV_8U && ddepth == CV_32S )
2556
+ return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
2557
+ (kernel, anchor, RowVec_8u32s(kernel)));
2558
+ if( sdepth == CV_8U && ddepth == CV_32F )
2559
+ return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
2560
+ if( sdepth == CV_8U && ddepth == CV_64F )
2561
+ return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
2562
+ if( sdepth == CV_16U && ddepth == CV_32F )
2563
+ return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
2564
+ if( sdepth == CV_16U && ddepth == CV_64F )
2565
+ return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
2566
+ if( sdepth == CV_16S && ddepth == CV_32F )
2567
+ return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
2568
+ if( sdepth == CV_16S && ddepth == CV_64F )
2569
+ return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
2570
+ if( sdepth == CV_32F && ddepth == CV_32F )
2571
+ return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
2572
+ (kernel, anchor, RowVec_32f(kernel)));
2573
+ if( sdepth == CV_64F && ddepth == CV_64F )
2574
+ return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));
2575
+
2576
+ CV_Error_( CV_StsNotImplemented,
2577
+ ("Unsupported combination of source format (=%d), and buffer format (=%d)",
2578
+ srcType, bufType));
2579
+
2580
+ return Ptr<BaseRowFilter>(0);
2581
+ }
2582
+
2583
+
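+ // Factory for the vertical pass. `bits` is the fixed-point shift used by FixedPtCastEx
+ // when 8-bit output is accumulated in CV_32S; non-symmetric kernels always take the
+ // generic ColumnFilter path, symmetric ones prefer the Symm* specializations.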
2584
+ cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
2585
+ InputArray _kernel, int anchor,
2586
+ int symmetryType, double delta,
2587
+ int bits )
2588
+ {
2589
+ Mat kernel = _kernel.getMat();
2590
+ int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
2591
+ int cn = CV_MAT_CN(dstType);
2592
+ CV_Assert( cn == CV_MAT_CN(bufType) &&
2593
+ sdepth >= std::max(ddepth, CV_32S) &&
2594
+ kernel.type() == sdepth );
2595
+
2596
+ if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
2597
+ {
2598
+ if( ddepth == CV_8U && sdepth == CV_32S )
2599
+ return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
2600
+ (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
2601
+ if( ddepth == CV_8U && sdepth == CV_32F )
2602
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
2603
+ if( ddepth == CV_8U && sdepth == CV_64F )
2604
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
2605
+ if( ddepth == CV_16U && sdepth == CV_32F )
2606
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
2607
+ if( ddepth == CV_16U && sdepth == CV_64F )
2608
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
2609
+ if( ddepth == CV_16S && sdepth == CV_32F )
2610
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
2611
+ if( ddepth == CV_16S && sdepth == CV_64F )
2612
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
2613
+ if( ddepth == CV_32F && sdepth == CV_32F )
2614
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
2615
+ if( ddepth == CV_64F && sdepth == CV_64F )
2616
+ return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
2617
+ }
2618
+ else
2619
+ {
2620
+ int ksize = kernel.rows + kernel.cols - 1;
2621
+ if( ksize == 3 )
2622
+ {
2623
+ if( ddepth == CV_8U && sdepth == CV_32S )
2624
+ return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
2625
+ FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
2626
+ (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
2627
+ SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
2628
+ if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
2629
+ return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<Cast<int, short>,
2630
+ SymmColumnSmallVec_32s16s>(kernel, anchor, delta, symmetryType,
2631
+ Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
2632
+ if( ddepth == CV_32F && sdepth == CV_32F )
2633
+ return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter<
2634
+ Cast<float, float>,SymmColumnSmallVec_32f>
2635
+ (kernel, anchor, delta, symmetryType, Cast<float, float>(),
2636
+ SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
2637
+ }
2638
+ if( ddepth == CV_8U && sdepth == CV_32S )
2639
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
2640
+ (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
2641
+ SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
2642
+ if( ddepth == CV_8U && sdepth == CV_32F )
2643
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
2644
+ (kernel, anchor, delta, symmetryType));
2645
+ if( ddepth == CV_8U && sdepth == CV_64F )
2646
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
2647
+ (kernel, anchor, delta, symmetryType));
2648
+ if( ddepth == CV_16U && sdepth == CV_32F )
2649
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
2650
+ (kernel, anchor, delta, symmetryType));
2651
+ if( ddepth == CV_16U && sdepth == CV_64F )
2652
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
2653
+ (kernel, anchor, delta, symmetryType));
2654
+ if( ddepth == CV_16S && sdepth == CV_32S )
2655
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
2656
+ (kernel, anchor, delta, symmetryType));
2657
+ if( ddepth == CV_16S && sdepth == CV_32F )
2658
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
2659
+ (kernel, anchor, delta, symmetryType));
2660
+ if( ddepth == CV_16S && sdepth == CV_64F )
2661
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
2662
+ (kernel, anchor, delta, symmetryType));
2663
+ if( ddepth == CV_32F && sdepth == CV_32F )
2664
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
2665
+ (kernel, anchor, delta, symmetryType, Cast<float, float>(),
2666
+ SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
2667
+ if( ddepth == CV_64F && sdepth == CV_64F )
2668
+ return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
2669
+ (kernel, anchor, delta, symmetryType));
2670
+ }
2671
+
2672
+ CV_Error_( CV_StsNotImplemented,
2673
+ ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
2674
+ bufType, dstType));
2675
+
2676
+ return Ptr<BaseColumnFilter>(0);
2677
+ }
2678
+
2679
+
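+ // Assembles a full separable FilterEngine. For 8-bit input with smooth symmetric
+ // kernels (or integer symmetric kernels with 16s output) both kernels are converted
+ // to fixed-point CV_32S and delta is scaled to match; otherwise the intermediate
+ // buffer runs in CV_32F or CV_64F.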
2680
+ cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
2681
+ int _srcType, int _dstType,
2682
+ InputArray __rowKernel, InputArray __columnKernel,
2683
+ Point _anchor, double _delta,
2684
+ int _rowBorderType, int _columnBorderType,
2685
+ const Scalar& _borderValue )
2686
+ {
2687
+ Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
2688
+ _srcType = CV_MAT_TYPE(_srcType);
2689
+ _dstType = CV_MAT_TYPE(_dstType);
2690
+ int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2691
+ int cn = CV_MAT_CN(_srcType);
2692
+ CV_Assert( cn == CV_MAT_CN(_dstType) );
2693
+ int rsize = _rowKernel.rows + _rowKernel.cols - 1;
2694
+ int csize = _columnKernel.rows + _columnKernel.cols - 1;
2695
+ if( _anchor.x < 0 )
2696
+ _anchor.x = rsize/2;
2697
+ if( _anchor.y < 0 )
2698
+ _anchor.y = csize/2;
2699
+ int rtype = getKernelType(_rowKernel,
2700
+ _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
2701
+ int ctype = getKernelType(_columnKernel,
2702
+ _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
2703
+ Mat rowKernel, columnKernel;
2704
+
2705
+ int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
2706
+ int bits = 0;
2707
+
2708
+ if( sdepth == CV_8U &&
2709
+ ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2710
+ ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
2711
+ ddepth == CV_8U) ||
2712
+ ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2713
+ (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
2714
+ (rtype & ctype & KERNEL_INTEGER) &&
2715
+ ddepth == CV_16S)) )
2716
+ {
2717
+ bdepth = CV_32S;
2718
+ bits = ddepth == CV_8U ? 8 : 0;
2719
+ _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
2720
+ _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
2721
+ bits *= 2;
2722
+ _delta *= (1 << bits);
2723
+ }
2724
+ else
2725
+ {
2726
+ if( _rowKernel.type() != bdepth )
2727
+ _rowKernel.convertTo( rowKernel, bdepth );
2728
+ else
2729
+ rowKernel = _rowKernel;
2730
+ if( _columnKernel.type() != bdepth )
2731
+ _columnKernel.convertTo( columnKernel, bdepth );
2732
+ else
2733
+ columnKernel = _columnKernel;
2734
+ }
2735
+
2736
+ int _bufType = CV_MAKETYPE(bdepth, cn);
2737
+ Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
2738
+ _srcType, _bufType, rowKernel, _anchor.x, rtype);
2739
+ Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
2740
+ _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
2741
+
2742
+ return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
2743
+ _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
2744
+ }
2745
+
2746
+
2747
+ /****************************************************************************************\
2748
+ * Non-separable linear filter *
2749
+ \****************************************************************************************/
2750
+
2751
+ namespace cv
2752
+ {
2753
+
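+ // Flattens a 2D kernel into parallel arrays of non-zero tap coordinates and raw
+ // coefficient bytes (element size depends on the kernel depth); an all-zero kernel
+ // still yields a single zero tap so callers always have at least one entry.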
2754
+ void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs )
2755
+ {
2756
+ int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
2757
+ if(nz == 0)
2758
+ nz = 1;
2759
+ CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
2760
+ coords.resize(nz);
2761
+ coeffs.resize(nz*getElemSize(ktype));
2762
+ uchar* _coeffs = &coeffs[0];
2763
+
2764
+ for( i = k = 0; i < kernel.rows; i++ )
2765
+ {
2766
+ const uchar* krow = kernel.data + kernel.step*i;
2767
+ for( j = 0; j < kernel.cols; j++ )
2768
+ {
2769
+ if( ktype == CV_8U )
2770
+ {
2771
+ uchar val = krow[j];
2772
+ if( val == 0 )
2773
+ continue;
2774
+ coords[k] = Point(j,i);
2775
+ _coeffs[k++] = val;
2776
+ }
2777
+ else if( ktype == CV_32S )
2778
+ {
2779
+ int val = ((const int*)krow)[j];
2780
+ if( val == 0 )
2781
+ continue;
2782
+ coords[k] = Point(j,i);
2783
+ ((int*)_coeffs)[k++] = val;
2784
+ }
2785
+ else if( ktype == CV_32F )
2786
+ {
2787
+ float val = ((const float*)krow)[j];
2788
+ if( val == 0 )
2789
+ continue;
2790
+ coords[k] = Point(j,i);
2791
+ ((float*)_coeffs)[k++] = val;
2792
+ }
2793
+ else
2794
+ {
2795
+ double val = ((const double*)krow)[j];
2796
+ if( val == 0 )
2797
+ continue;
2798
+ coords[k] = Point(j,i);
2799
+ ((double*)_coeffs)[k++] = val;
2800
+ }
2801
+ }
2802
+ }
2803
+ }
2804
+
2805
+
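+ // Non-separable filter engine: for every output row it rebuilds the per-tap source
+ // pointers from the coordinate list, lets VecOp process a SIMD prefix, and finishes
+ // with a 4-way unrolled scalar multiply-accumulate over the remaining pixels.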
2806
+ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
2807
+ {
2808
+ typedef typename CastOp::type1 KT;
2809
+ typedef typename CastOp::rtype DT;
2810
+
2811
+ Filter2D( const Mat& _kernel, Point _anchor,
2812
+ double _delta, const CastOp& _castOp=CastOp(),
2813
+ const VecOp& _vecOp=VecOp() )
2814
+ {
2815
+ anchor = _anchor;
2816
+ ksize = _kernel.size();
2817
+ delta = saturate_cast<KT>(_delta);
2818
+ castOp0 = _castOp;
2819
+ vecOp = _vecOp;
2820
+ CV_Assert( _kernel.type() == DataType<KT>::type );
2821
+ preprocess2DKernel( _kernel, coords, coeffs );
2822
+ ptrs.resize( coords.size() );
2823
+ }
2824
+
2825
+ void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
2826
+ {
2827
+ KT _delta = delta;
2828
+ const Point* pt = &coords[0];
2829
+ const KT* kf = (const KT*)&coeffs[0];
2830
+ const ST** kp = (const ST**)&ptrs[0];
2831
+ int i, k, nz = (int)coords.size();
2832
+ CastOp castOp = castOp0;
2833
+
2834
+ width *= cn;
2835
+ for( ; count > 0; count--, dst += dststep, src++ )
2836
+ {
2837
+ DT* D = (DT*)dst;
2838
+
2839
+ for( k = 0; k < nz; k++ )
2840
+ kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;
2841
+
2842
+ i = vecOp((const uchar**)kp, dst, width);
2843
+
2844
+ for( ; i <= width - 4; i += 4 )
2845
+ {
2846
+ KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
2847
+
2848
+ for( k = 0; k < nz; k++ )
2849
+ {
2850
+ const ST* sptr = kp[k] + i;
2851
+ KT f = kf[k];
2852
+ s0 += f*sptr[0];
2853
+ s1 += f*sptr[1];
2854
+ s2 += f*sptr[2];
2855
+ s3 += f*sptr[3];
2856
+ }
2857
+
2858
+ D[i] = castOp(s0); D[i+1] = castOp(s1);
2859
+ D[i+2] = castOp(s2); D[i+3] = castOp(s3);
2860
+ }
2861
+
2862
+ for( ; i < width; i++ )
2863
+ {
2864
+ KT s0 = _delta;
2865
+ for( k = 0; k < nz; k++ )
2866
+ s0 += kf[k]*kp[k][i];
2867
+ D[i] = castOp(s0);
2868
+ }
2869
+ }
2870
+ }
2871
+
2872
+ vector<Point> coords;
2873
+ vector<uchar> coeffs;
2874
+ vector<uchar*> ptrs;
2875
+ KT delta;
2876
+ CastOp castOp0;
2877
+ VecOp vecOp;
2878
+ };
2879
+
2880
+ }
2881
+
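+ // Picks the Filter2D instantiation for a (srcType, dstType) pair. The fixed-point
+ // 8-bit branches are disabled in this snapshot, so the kernel is always converted to
+ // CV_32F (or CV_64F when either depth is CV_64F) before filtering.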
2882
+ cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
2883
+ InputArray filter_kernel, Point anchor,
2884
+ double delta, int bits)
2885
+ {
2886
+ Mat _kernel = filter_kernel.getMat();
2887
+ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
2888
+ int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
2889
+ CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );
2890
+
2891
+ anchor = normalizeAnchor(anchor, _kernel.size());
2892
+
2893
+ /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
2894
+ return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
2895
+ (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
2896
+ FilterVec_8u(_kernel, bits, delta)));
2897
+ if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
2898
+ return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
2899
+ (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
2900
+ FilterVec_8u16s(_kernel, bits, delta)));*/
2901
+
2902
+ kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
2903
+ Mat kernel;
2904
+ if( _kernel.type() == kdepth )
2905
+ kernel = _kernel;
2906
+ else
2907
+ _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);
2908
+
2909
+ if( sdepth == CV_8U && ddepth == CV_8U )
2910
+ return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
2911
+ (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
2912
+ if( sdepth == CV_8U && ddepth == CV_16U )
2913
+ return Ptr<BaseFilter>(new Filter2D<uchar,
2914
+ Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
2915
+ if( sdepth == CV_8U && ddepth == CV_16S )
2916
+ return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
2917
+ (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
2918
+ if( sdepth == CV_8U && ddepth == CV_32F )
2919
+ return Ptr<BaseFilter>(new Filter2D<uchar,
2920
+ Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2921
+ if( sdepth == CV_8U && ddepth == CV_64F )
2922
+ return Ptr<BaseFilter>(new Filter2D<uchar,
2923
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2924
+
2925
+ if( sdepth == CV_16U && ddepth == CV_16U )
2926
+ return Ptr<BaseFilter>(new Filter2D<ushort,
2927
+ Cast<float, ushort>, FilterNoVec>(kernel, anchor, delta));
2928
+ if( sdepth == CV_16U && ddepth == CV_32F )
2929
+ return Ptr<BaseFilter>(new Filter2D<ushort,
2930
+ Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2931
+ if( sdepth == CV_16U && ddepth == CV_64F )
2932
+ return Ptr<BaseFilter>(new Filter2D<ushort,
2933
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2934
+
2935
+ if( sdepth == CV_16S && ddepth == CV_16S )
2936
+ return Ptr<BaseFilter>(new Filter2D<short,
2937
+ Cast<float, short>, FilterNoVec>(kernel, anchor, delta));
2938
+ if( sdepth == CV_16S && ddepth == CV_32F )
2939
+ return Ptr<BaseFilter>(new Filter2D<short,
2940
+ Cast<float, float>, FilterNoVec>(kernel, anchor, delta));
2941
+ if( sdepth == CV_16S && ddepth == CV_64F )
2942
+ return Ptr<BaseFilter>(new Filter2D<short,
2943
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2944
+
2945
+ if( sdepth == CV_32F && ddepth == CV_32F )
2946
+ return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
2947
+ (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
2948
+ if( sdepth == CV_64F && ddepth == CV_64F )
2949
+ return Ptr<BaseFilter>(new Filter2D<double,
2950
+ Cast<double, double>, FilterNoVec>(kernel, anchor, delta));
2951
+
2952
+ CV_Error_( CV_StsNotImplemented,
2953
+ ("Unsupported combination of source format (=%d), and destination format (=%d)",
2954
+ srcType, dstType));
2955
+
2956
+ return Ptr<BaseFilter>(0);
2957
+ }
2958
+
2959
+
2960
+ cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
2961
+ InputArray filter_kernel,
2962
+ Point _anchor, double _delta,
2963
+ int _rowBorderType, int _columnBorderType,
2964
+ const Scalar& _borderValue )
2965
+ {
2966
+ Mat _kernel = filter_kernel.getMat();
2967
+ _srcType = CV_MAT_TYPE(_srcType);
2968
+ _dstType = CV_MAT_TYPE(_dstType);
2969
+ int cn = CV_MAT_CN(_srcType);
2970
+ CV_Assert( cn == CV_MAT_CN(_dstType) );
2971
+
2972
+ Mat kernel = _kernel;
2973
+ int bits = 0;
2974
+
2975
+ /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
2976
+ int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
2977
+ if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
2978
+ _kernel.rows*_kernel.cols <= (1 << 10) )
2979
+ {
2980
+ bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
2981
+ _kernel.convertTo(kernel, CV_32S, 1 << bits);
2982
+ }*/
2983
+
2984
+ Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
2985
+ kernel, _anchor, _delta, bits);
2986
+
2987
+ return Ptr<FilterEngine>(new FilterEngine(_filter2D, Ptr<BaseRowFilter>(0),
2988
+ Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
2989
+ _rowBorderType, _columnBorderType, _borderValue ));
2990
+ }
2991
+
2992
+
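+ // Public entry point: small kernels are convolved directly through createLinearFilter,
+ // while kernels at or above dft_filter_size taps (50, or 130 when SSE3 is available for
+ // the listed 8u/32f combinations) are handled in the frequency domain via crossCorr().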
2993
+ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
2994
+ InputArray _kernel, Point anchor,
2995
+ double delta, int borderType )
2996
+ {
2997
+ Mat src = _src.getMat(), kernel = _kernel.getMat();
2998
+
2999
+ if( ddepth < 0 )
3000
+ ddepth = src.depth();
3001
+
3002
+ #if CV_SSE2
3003
+ int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
3004
+ (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50;
3005
+ #else
3006
+ int dft_filter_size = 50;
3007
+ #endif
3008
+
3009
+ _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
3010
+ Mat dst = _dst.getMat();
3011
+ anchor = normalizeAnchor(anchor, kernel.size());
3012
+
3013
+ if( kernel.cols*kernel.rows >= dft_filter_size )
3014
+ {
3015
+ Mat temp;
3016
+ if( src.data != dst.data )
3017
+ temp = dst;
3018
+ else
3019
+ temp.create(dst.size(), dst.type());
3020
+ crossCorr( src, kernel, temp, src.size(),
3021
+ CV_MAKETYPE(ddepth, src.channels()),
3022
+ anchor, delta, borderType );
3023
+ if( temp.data != dst.data )
3024
+ temp.copyTo(dst);
3025
+ return;
3026
+ }
3027
+
3028
+ Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
3029
+ anchor, delta, borderType );
3030
+ f->apply(src, dst);
3031
+ }
3032
+
3033
+
3034
+ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
3035
+ InputArray _kernelX, InputArray _kernelY, Point anchor,
3036
+ double delta, int borderType )
3037
+ {
3038
+ Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();
3039
+
3040
+ if( ddepth < 0 )
3041
+ ddepth = src.depth();
3042
+
3043
+ _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
3044
+ Mat dst = _dst.getMat();
3045
+
3046
+ Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
3047
+ dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
3048
+ f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
3049
+ }
3050
+
3051
+
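+ /* Illustrative usage sketch (not part of the original source): how the C++ entry
+    points defined above are typically called. The file name, kernel size and sigma
+    below are arbitrary example values.
+
+      cv::Mat src = cv::imread("input.png");             // hypothetical input image
+      cv::Mat smoothed, dx;
+      cv::Mat g = cv::getGaussianKernel(5, 1.5, CV_32F);  // 5-tap Gaussian, sigma = 1.5
+      cv::sepFilter2D(src, smoothed, -1, g, g);           // separable path: row + column filters
+      cv::Mat sobel = (cv::Mat_<float>(3,3) << -1,0,1, -2,0,2, -1,0,1);
+      cv::filter2D(src, dx, CV_32F, sobel);               // non-separable path (DFT for large kernels)
+ */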
3052
+ CV_IMPL void
3053
+ cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
3054
+ {
3055
+ cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
3056
+ cv::Mat kernel = cv::cvarrToMat(_kernel);
3057
+
3058
+ CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );
3059
+
3060
+ cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
3061
+ }
3062
+
3063
+ /* End of file. */