@img/sharp-libvips-dev 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/cplusplus/VConnection.cpp +54 -54
- package/cplusplus/VError.cpp +20 -18
- package/cplusplus/VImage.cpp +636 -589
- package/cplusplus/VInterpolate.cpp +22 -22
- package/cplusplus/VRegion.cpp +4 -4
- package/cplusplus/vips-operators.cpp +2326 -2301
- package/include/aom/aom_codec.h +10 -6
- package/include/aom/aom_decoder.h +1 -1
- package/include/aom/aom_encoder.h +9 -2
- package/include/aom/aomcx.h +72 -3
- package/include/cairo/cairo-ft.h +1 -1
- package/include/cairo/cairo-gobject.h +8 -0
- package/include/cairo/cairo-svg.h +3 -3
- package/include/cairo/cairo-version.h +2 -2
- package/include/cairo/cairo.h +91 -24
- package/include/harfbuzz/hb-version.h +2 -2
- package/include/hwy/aligned_allocator.h +211 -0
- package/include/hwy/base.h +1517 -0
- package/include/hwy/cache_control.h +108 -0
- package/include/hwy/detect_compiler_arch.h +281 -0
- package/include/hwy/detect_targets.h +644 -0
- package/include/hwy/foreach_target.h +340 -0
- package/include/hwy/highway.h +435 -0
- package/include/hwy/highway_export.h +74 -0
- package/include/hwy/nanobenchmark.h +171 -0
- package/include/hwy/ops/arm_neon-inl.h +8913 -0
- package/include/hwy/ops/arm_sve-inl.h +5105 -0
- package/include/hwy/ops/emu128-inl.h +2811 -0
- package/include/hwy/ops/generic_ops-inl.h +4745 -0
- package/include/hwy/ops/ppc_vsx-inl.h +5716 -0
- package/include/hwy/ops/rvv-inl.h +5070 -0
- package/include/hwy/ops/scalar-inl.h +1995 -0
- package/include/hwy/ops/set_macros-inl.h +578 -0
- package/include/hwy/ops/shared-inl.h +539 -0
- package/include/hwy/ops/tuple-inl.h +125 -0
- package/include/hwy/ops/wasm_128-inl.h +5917 -0
- package/include/hwy/ops/x86_128-inl.h +11173 -0
- package/include/hwy/ops/x86_256-inl.h +7529 -0
- package/include/hwy/ops/x86_512-inl.h +6849 -0
- package/include/hwy/per_target.h +44 -0
- package/include/hwy/print-inl.h +62 -0
- package/include/hwy/print.h +75 -0
- package/include/hwy/robust_statistics.h +148 -0
- package/include/hwy/targets.h +338 -0
- package/include/hwy/timer-inl.h +200 -0
- package/include/hwy/timer.h +55 -0
- package/include/jconfig.h +2 -2
- package/include/jpeglib.h +3 -2
- package/include/libheif/heif.h +443 -377
- package/include/libheif/heif_cxx.h +4 -1
- package/include/libheif/heif_plugin.h +1 -1
- package/include/libheif/heif_properties.h +138 -0
- package/include/libheif/heif_regions.h +866 -0
- package/include/libheif/heif_version.h +3 -3
- package/include/vips/VConnection8.h +43 -49
- package/include/vips/VError8.h +27 -24
- package/include/vips/VImage8.h +4861 -4597
- package/include/vips/VInterpolate8.h +24 -27
- package/include/vips/VRegion8.h +32 -33
- package/include/vips/arithmetic.h +169 -169
- package/include/vips/basic.h +33 -33
- package/include/vips/buf.h +56 -54
- package/include/vips/colour.h +95 -95
- package/include/vips/connection.h +190 -193
- package/include/vips/conversion.h +91 -91
- package/include/vips/convolution.h +36 -30
- package/include/vips/create.h +63 -63
- package/include/vips/dbuf.h +35 -37
- package/include/vips/debug.h +65 -33
- package/include/vips/draw.h +41 -41
- package/include/vips/enumtypes.h +54 -51
- package/include/vips/error.h +63 -63
- package/include/vips/foreign.h +263 -223
- package/include/vips/format.h +48 -48
- package/include/vips/freqfilt.h +22 -22
- package/include/vips/gate.h +55 -47
- package/include/vips/generate.h +34 -34
- package/include/vips/header.h +111 -101
- package/include/vips/histogram.h +28 -28
- package/include/vips/image.h +213 -213
- package/include/vips/interpolate.h +40 -41
- package/include/vips/memory.h +61 -52
- package/include/vips/morphology.h +24 -24
- package/include/vips/mosaicing.h +32 -33
- package/include/vips/object.h +371 -357
- package/include/vips/operation.h +68 -67
- package/include/vips/private.h +76 -76
- package/include/vips/rect.h +26 -26
- package/include/vips/region.h +92 -92
- package/include/vips/resample.h +38 -38
- package/include/vips/sbuf.h +53 -54
- package/include/vips/semaphore.h +24 -24
- package/include/vips/thread.h +30 -27
- package/include/vips/threadpool.h +48 -49
- package/include/vips/transform.h +39 -39
- package/include/vips/type.h +90 -85
- package/include/vips/util.h +274 -229
- package/include/vips/vector.h +24 -144
- package/include/vips/version.h +9 -9
- package/include/vips/vips.h +41 -40
- package/package.json +1 -1
- package/versions.json +7 -7
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
// Copyright 2020 Google LLC
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
|
|
16
|
+
// Main header required before using vector types.
|
|
17
|
+
|
|
18
|
+
// IWYU pragma: begin_exports
|
|
19
|
+
#include "hwy/base.h"
|
|
20
|
+
#include "hwy/detect_compiler_arch.h"
|
|
21
|
+
#include "hwy/highway_export.h"
|
|
22
|
+
#include "hwy/targets.h"
|
|
23
|
+
// IWYU pragma: end_exports
|
|
24
|
+
|
|
25
|
+
// This include guard is checked by foreach_target, so avoid the usual _H_
|
|
26
|
+
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
|
|
27
|
+
// after/outside this include guard.
|
|
28
|
+
#ifndef HWY_HIGHWAY_INCLUDED
|
|
29
|
+
#define HWY_HIGHWAY_INCLUDED
|
|
30
|
+
|
|
31
|
+
namespace hwy {
|
|
32
|
+
|
|
33
|
+
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
|
|
34
|
+
#define HWY_MAJOR 1
|
|
35
|
+
#define HWY_MINOR 0
|
|
36
|
+
#define HWY_PATCH 7
|
|
37
|
+
|
|
38
|
+
//------------------------------------------------------------------------------
|
|
39
|
+
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
|
|
40
|
+
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
|
|
41
|
+
// HWY_CAPPED(T, N).
|
|
42
|
+
|
|
43
|
+
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
|
|
44
|
+
// registers in the group, and is ignored on targets that do not support groups.
|
|
45
|
+
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
|
|
46
|
+
#define HWY_FULL2(T, LMUL) \
|
|
47
|
+
hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
|
|
48
|
+
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
|
|
49
|
+
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
|
|
50
|
+
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
|
|
51
|
+
// Trailing comma avoids -pedantic false alarm
|
|
52
|
+
#define HWY_CHOOSE_FULL(...) \
|
|
53
|
+
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
|
|
54
|
+
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
|
|
55
|
+
|
|
56
|
+
// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
|
|
57
|
+
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
|
|
58
|
+
|
|
59
|
+
//------------------------------------------------------------------------------
|
|
60
|
+
// Export user functions for static/dynamic dispatch
|
|
61
|
+
|
|
62
|
+
// Evaluates to 0 inside a translation unit if it is generating anything but the
|
|
63
|
+
// static target (the last one if multiple targets are enabled). Used to prevent
|
|
64
|
+
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
|
|
65
|
+
// compile once anyway, so this is 1 unless it is or has been included.
|
|
66
|
+
#ifndef HWY_ONCE
|
|
67
|
+
#define HWY_ONCE 1
|
|
68
|
+
#endif
|
|
69
|
+
|
|
70
|
+
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
|
|
71
|
+
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
|
|
72
|
+
// defined), and can be used to deduce the return type of Choose*.
|
|
73
|
+
#if HWY_STATIC_TARGET == HWY_SCALAR
|
|
74
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
|
|
75
|
+
#elif HWY_STATIC_TARGET == HWY_EMU128
|
|
76
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
|
|
77
|
+
#elif HWY_STATIC_TARGET == HWY_RVV
|
|
78
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
|
|
79
|
+
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
|
|
80
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
|
|
81
|
+
#elif HWY_STATIC_TARGET == HWY_WASM
|
|
82
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
|
|
83
|
+
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
|
|
84
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
|
|
85
|
+
#elif HWY_STATIC_TARGET == HWY_NEON
|
|
86
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
|
|
87
|
+
#elif HWY_STATIC_TARGET == HWY_SVE
|
|
88
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
|
|
89
|
+
#elif HWY_STATIC_TARGET == HWY_SVE2
|
|
90
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
|
|
91
|
+
#elif HWY_STATIC_TARGET == HWY_SVE_256
|
|
92
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
|
|
93
|
+
#elif HWY_STATIC_TARGET == HWY_SVE2_128
|
|
94
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
|
|
95
|
+
#elif HWY_STATIC_TARGET == HWY_PPC8
|
|
96
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
|
|
97
|
+
#elif HWY_STATIC_TARGET == HWY_PPC9
|
|
98
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
|
|
99
|
+
#elif HWY_STATIC_TARGET == HWY_PPC10
|
|
100
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
|
|
101
|
+
#elif HWY_STATIC_TARGET == HWY_SSE2
|
|
102
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
|
|
103
|
+
#elif HWY_STATIC_TARGET == HWY_SSSE3
|
|
104
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
|
|
105
|
+
#elif HWY_STATIC_TARGET == HWY_SSE4
|
|
106
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
|
|
107
|
+
#elif HWY_STATIC_TARGET == HWY_AVX2
|
|
108
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
|
|
109
|
+
#elif HWY_STATIC_TARGET == HWY_AVX3
|
|
110
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
|
|
111
|
+
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
|
|
112
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
|
|
113
|
+
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
|
|
114
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
|
|
115
|
+
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
|
|
116
|
+
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
|
|
117
|
+
#endif
|
|
118
|
+
|
|
119
|
+
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
|
|
120
|
+
// nullptr if that target was not compiled.
|
|
121
|
+
#if HWY_TARGETS & HWY_EMU128
|
|
122
|
+
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
|
|
123
|
+
#elif HWY_TARGETS & HWY_SCALAR
|
|
124
|
+
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
|
|
125
|
+
#else
|
|
126
|
+
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
|
|
127
|
+
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
|
|
128
|
+
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
|
|
129
|
+
#endif
|
|
130
|
+
|
|
131
|
+
#if HWY_TARGETS & HWY_WASM_EMU256
|
|
132
|
+
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
|
|
133
|
+
#else
|
|
134
|
+
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
|
|
135
|
+
#endif
|
|
136
|
+
|
|
137
|
+
#if HWY_TARGETS & HWY_WASM
|
|
138
|
+
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
|
|
139
|
+
#else
|
|
140
|
+
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
|
|
141
|
+
#endif
|
|
142
|
+
|
|
143
|
+
#if HWY_TARGETS & HWY_RVV
|
|
144
|
+
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
|
|
145
|
+
#else
|
|
146
|
+
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
|
|
147
|
+
#endif
|
|
148
|
+
|
|
149
|
+
#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
|
|
150
|
+
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
|
|
151
|
+
#else
|
|
152
|
+
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
|
|
153
|
+
#endif
|
|
154
|
+
|
|
155
|
+
#if HWY_TARGETS & HWY_NEON
|
|
156
|
+
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
|
|
157
|
+
#else
|
|
158
|
+
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
|
|
159
|
+
#endif
|
|
160
|
+
|
|
161
|
+
#if HWY_TARGETS & HWY_SVE
|
|
162
|
+
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
|
|
163
|
+
#else
|
|
164
|
+
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
|
|
165
|
+
#endif
|
|
166
|
+
|
|
167
|
+
#if HWY_TARGETS & HWY_SVE2
|
|
168
|
+
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
|
|
169
|
+
#else
|
|
170
|
+
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
|
|
171
|
+
#endif
|
|
172
|
+
|
|
173
|
+
#if HWY_TARGETS & HWY_SVE_256
|
|
174
|
+
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
|
|
175
|
+
#else
|
|
176
|
+
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
|
|
177
|
+
#endif
|
|
178
|
+
|
|
179
|
+
#if HWY_TARGETS & HWY_SVE2_128
|
|
180
|
+
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
|
|
181
|
+
#else
|
|
182
|
+
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
|
|
183
|
+
#endif
|
|
184
|
+
|
|
185
|
+
#if HWY_TARGETS & HWY_PPC8
|
|
186
|
+
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
|
|
187
|
+
#else
|
|
188
|
+
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
|
|
189
|
+
#endif
|
|
190
|
+
|
|
191
|
+
#if HWY_TARGETS & HWY_PPC9
|
|
192
|
+
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
|
|
193
|
+
#else
|
|
194
|
+
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
|
|
195
|
+
#endif
|
|
196
|
+
|
|
197
|
+
#if HWY_TARGETS & HWY_PPC10
|
|
198
|
+
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
|
|
199
|
+
#else
|
|
200
|
+
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
|
|
201
|
+
#endif
|
|
202
|
+
|
|
203
|
+
#if HWY_TARGETS & HWY_SSE2
|
|
204
|
+
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
|
|
205
|
+
#else
|
|
206
|
+
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
|
|
207
|
+
#endif
|
|
208
|
+
|
|
209
|
+
#if HWY_TARGETS & HWY_SSSE3
|
|
210
|
+
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
|
|
211
|
+
#else
|
|
212
|
+
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
|
|
213
|
+
#endif
|
|
214
|
+
|
|
215
|
+
#if HWY_TARGETS & HWY_SSE4
|
|
216
|
+
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
|
|
217
|
+
#else
|
|
218
|
+
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
|
|
219
|
+
#endif
|
|
220
|
+
|
|
221
|
+
#if HWY_TARGETS & HWY_AVX2
|
|
222
|
+
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
|
|
223
|
+
#else
|
|
224
|
+
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
|
|
225
|
+
#endif
|
|
226
|
+
|
|
227
|
+
#if HWY_TARGETS & HWY_AVX3
|
|
228
|
+
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
|
|
229
|
+
#else
|
|
230
|
+
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
|
|
231
|
+
#endif
|
|
232
|
+
|
|
233
|
+
#if HWY_TARGETS & HWY_AVX3_DL
|
|
234
|
+
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
|
|
235
|
+
#else
|
|
236
|
+
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
|
|
237
|
+
#endif
|
|
238
|
+
|
|
239
|
+
#if HWY_TARGETS & HWY_AVX3_ZEN4
|
|
240
|
+
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
|
|
241
|
+
#else
|
|
242
|
+
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
|
|
243
|
+
#endif
|
|
244
|
+
|
|
245
|
+
#if HWY_TARGETS & HWY_AVX3_SPR
|
|
246
|
+
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
|
|
247
|
+
#else
|
|
248
|
+
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
|
|
249
|
+
#endif
|
|
250
|
+
|
|
251
|
+
// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
|
|
252
|
+
// apparently cannot be an array. Use a function pointer instead, which has the
|
|
253
|
+
// disadvantage that we call the static (not best) target on the first call to
|
|
254
|
+
// any HWY_DYNAMIC_DISPATCH.
|
|
255
|
+
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
|
|
256
|
+
#define HWY_DISPATCH_WORKAROUND 1
|
|
257
|
+
#else
|
|
258
|
+
#define HWY_DISPATCH_WORKAROUND 0
|
|
259
|
+
#endif
|
|
260
|
+
|
|
261
|
+
// Holder for ChooseAndCall, a static member function template that is invoked
|
|
262
|
+
// on the first HWY_DYNAMIC_DISPATCH (while GetIndex is still zero);
|
|
263
|
+
// instantiations of it are the first entry in the tables created by HWY_EXPORT.
|
|
264
|
+
template <typename RetType, typename... Args>
|
|
265
|
+
struct FunctionCache {
|
|
266
|
+
public:
|
|
267
|
+
typedef RetType(FunctionType)(Args...);  // Function (not pointer) type of the exported function.
|
|
268
|
+
|
|
269
|
+
#if HWY_DISPATCH_WORKAROUND
|
|
270
|
+
template <FunctionType* const func>  // func: HWY_EXPORT passes &HWY_STATIC_DISPATCH(FUNC_NAME) here.
|
|
271
|
+
static RetType ChooseAndCall(Args... args) {
|
|
272
|
+
ChosenTarget& chosen_target = GetChosenTarget();
|
|
273
|
+
chosen_target.Update(SupportedTargets());  // Record the targets supported by the current CPU.
|
|
274
|
+
return (*func)(args...);  // Workaround path: this first call runs the static (not best) target.
|
|
275
|
+
}
|
|
276
|
+
#else
|
|
277
|
+
// A function template whose instantiations have the same signature as the
|
|
278
|
+
// function being called. It initializes the bit array of targets
|
|
279
|
+
// supported by the current CPU and then calls the appropriate entry within
|
|
280
|
+
// the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
|
|
281
|
+
// exported functions, even those defined by different translation units,
|
|
282
|
+
// will dispatch directly to the best available target.
|
|
283
|
+
template <FunctionType* const table[]>
|
|
284
|
+
static RetType ChooseAndCall(Args... args) {
|
|
285
|
+
ChosenTarget& chosen_target = GetChosenTarget();
|
|
286
|
+
chosen_target.Update(SupportedTargets());  // Record the targets supported by the current CPU.
|
|
287
|
+
return (table[chosen_target.GetIndex()])(args...);  // Forward to the entry selected after Update().
|
|
288
|
+
}
|
|
289
|
+
#endif // HWY_DISPATCH_WORKAROUND
|
|
290
|
+
};
|
|
291
|
+
|
|
292
|
+
// Deduces RetType and Args from a function pointer; HWY_EXPORT applies decltype to this call to name the matching FunctionCache type.
|
|
293
|
+
template <typename RetType, typename... Args>
|
|
294
|
+
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
|
|
295
|
+
return FunctionCache<RetType, Args...>();  // Default-constructed; FunctionCache declares no data members.
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
|
|
299
|
+
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
|
|
300
|
+
|
|
301
|
+
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
|
|
302
|
+
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
|
|
303
|
+
// static array must be defined at the same namespace level as the function
|
|
304
|
+
// it is exporting.
|
|
305
|
+
// After being exported, it can be called from other parts of the same source
|
|
306
|
+
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
|
|
307
|
+
// like in the following example:
|
|
308
|
+
//
|
|
309
|
+
// #include "hwy/highway.h"
|
|
310
|
+
// HWY_BEFORE_NAMESPACE();
|
|
311
|
+
// namespace skeleton {
|
|
312
|
+
// namespace HWY_NAMESPACE {
|
|
313
|
+
//
|
|
314
|
+
// void MyFunction(int a, char b, const char* c) { ... }
|
|
315
|
+
//
|
|
316
|
+
// // NOLINTNEXTLINE(google-readability-namespace-comments)
|
|
317
|
+
// } // namespace HWY_NAMESPACE
|
|
318
|
+
// } // namespace skeleton
|
|
319
|
+
// HWY_AFTER_NAMESPACE();
|
|
320
|
+
//
|
|
321
|
+
// namespace skeleton {
|
|
322
|
+
// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
|
|
323
|
+
//
|
|
324
|
+
// void MyFunction(int a, char b, const char* c) {
|
|
325
|
+
// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
|
|
326
|
+
// }
|
|
327
|
+
// } // namespace skeleton
|
|
328
|
+
//
|
|
329
|
+
|
|
330
|
+
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
|
|
331
|
+
|
|
332
|
+
// Simplified version for IDE or the dynamic dispatch case with only one target.
|
|
333
|
+
// This case still uses a table, although of a single element, to provide the
|
|
334
|
+
// same compile error conditions as with the dynamic dispatch case when multiple
|
|
335
|
+
// targets are being compiled.
|
|
336
|
+
#define HWY_EXPORT(FUNC_NAME) \
|
|
337
|
+
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
|
|
338
|
+
HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
|
|
339
|
+
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
|
|
340
|
+
#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
|
|
341
|
+
|
|
342
|
+
#else
|
|
343
|
+
|
|
344
|
+
// Simplified version for MSVC 2017: ChooseAndCall takes a function pointer instead of the table as its template argument.
|
|
345
|
+
#if HWY_DISPATCH_WORKAROUND
|
|
346
|
+
|
|
347
|
+
#define HWY_EXPORT(FUNC_NAME) \
|
|
348
|
+
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
|
|
349
|
+
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
|
|
350
|
+
/* The first entry in the table initializes the global cache and \
|
|
351
|
+
* calls the function from HWY_STATIC_TARGET. */ \
|
|
352
|
+
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
|
|
353
|
+
FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
|
|
354
|
+
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
|
|
355
|
+
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
#else
|
|
359
|
+
|
|
360
|
+
// Dynamic dispatch case with one entry per dynamic target plus the fallback
|
|
361
|
+
// target and the initialization wrapper.
|
|
362
|
+
#define HWY_EXPORT(FUNC_NAME) \
|
|
363
|
+
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
|
|
364
|
+
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
|
|
365
|
+
/* The first entry in the table initializes the global cache and \
|
|
366
|
+
* calls the appropriate function. */ \
|
|
367
|
+
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
|
|
368
|
+
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
|
|
369
|
+
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
|
|
370
|
+
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
#endif // HWY_DISPATCH_WORKAROUND
|
|
374
|
+
|
|
375
|
+
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
|
|
376
|
+
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
|
|
377
|
+
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
|
|
378
|
+
(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])
|
|
379
|
+
|
|
380
|
+
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
|
|
381
|
+
|
|
382
|
+
// DEPRECATED names; please use HWY_HAVE_* instead.
|
|
383
|
+
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
|
|
384
|
+
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
|
|
385
|
+
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
|
|
386
|
+
|
|
387
|
+
} // namespace hwy
|
|
388
|
+
|
|
389
|
+
#endif // HWY_HIGHWAY_INCLUDED
|
|
390
|
+
|
|
391
|
+
//------------------------------------------------------------------------------
|
|
392
|
+
|
|
393
|
+
// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
|
|
394
|
+
// to include them once per target, which is ensured by the toggle check.
|
|
395
|
+
// Because ops/*.h are included under it, they do not need their own guard.
|
|
396
|
+
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
|
|
397
|
+
#ifdef HWY_HIGHWAY_PER_TARGET
|
|
398
|
+
#undef HWY_HIGHWAY_PER_TARGET
|
|
399
|
+
#else
|
|
400
|
+
#define HWY_HIGHWAY_PER_TARGET
|
|
401
|
+
#endif
|
|
402
|
+
|
|
403
|
+
// These define ops inside namespace hwy::HWY_NAMESPACE.
|
|
404
|
+
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
|
|
405
|
+
#include "hwy/ops/x86_128-inl.h"
|
|
406
|
+
#elif HWY_TARGET == HWY_AVX2
|
|
407
|
+
#include "hwy/ops/x86_256-inl.h"
|
|
408
|
+
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
|
|
409
|
+
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
|
|
410
|
+
#include "hwy/ops/x86_512-inl.h"
|
|
411
|
+
#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
|
|
412
|
+
HWY_TARGET == HWY_PPC10
|
|
413
|
+
#include "hwy/ops/ppc_vsx-inl.h"
|
|
414
|
+
#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
|
|
415
|
+
#include "hwy/ops/arm_neon-inl.h"
|
|
416
|
+
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
|
|
417
|
+
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
|
|
418
|
+
#include "hwy/ops/arm_sve-inl.h"
|
|
419
|
+
#elif HWY_TARGET == HWY_WASM_EMU256
|
|
420
|
+
#include "hwy/ops/wasm_256-inl.h"
|
|
421
|
+
#elif HWY_TARGET == HWY_WASM
|
|
422
|
+
#include "hwy/ops/wasm_128-inl.h"
|
|
423
|
+
#elif HWY_TARGET == HWY_RVV
|
|
424
|
+
#include "hwy/ops/rvv-inl.h"
|
|
425
|
+
#elif HWY_TARGET == HWY_EMU128
|
|
426
|
+
#include "hwy/ops/emu128-inl.h"
|
|
427
|
+
#elif HWY_TARGET == HWY_SCALAR
|
|
428
|
+
#include "hwy/ops/scalar-inl.h"
|
|
429
|
+
#else
|
|
430
|
+
#pragma message("HWY_TARGET does not match any known target")
|
|
431
|
+
#endif // HWY_TARGET
|
|
432
|
+
|
|
433
|
+
#include "hwy/ops/generic_ops-inl.h"
|
|
434
|
+
|
|
435
|
+
#endif // HWY_HIGHWAY_PER_TARGET
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
// Pseudo-generated file to handle both cmake & bazel build systems.
|
|
2
|
+
|
|
3
|
+
// Initial generation done using cmake code:
|
|
4
|
+
// include(GenerateExportHeader)
|
|
5
|
+
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
|
|
6
|
+
// hwy/highway_export.h)
|
|
7
|
+
// code reformatted using clang-format --style=Google
|
|
8
|
+
|
|
9
|
+
#ifndef HWY_DLLEXPORT_H
|
|
10
|
+
#define HWY_DLLEXPORT_H
|
|
11
|
+
|
|
12
|
+
#if !defined(HWY_SHARED_DEFINE)
|
|
13
|
+
#define HWY_DLLEXPORT
|
|
14
|
+
#define HWY_CONTRIB_DLLEXPORT
|
|
15
|
+
#define HWY_TEST_DLLEXPORT
|
|
16
|
+
#else // !HWY_SHARED_DEFINE
|
|
17
|
+
|
|
18
|
+
#ifndef HWY_DLLEXPORT
|
|
19
|
+
#if defined(hwy_EXPORTS)
|
|
20
|
+
/* We are building this library */
|
|
21
|
+
#ifdef _WIN32
|
|
22
|
+
#define HWY_DLLEXPORT __declspec(dllexport)
|
|
23
|
+
#else
|
|
24
|
+
#define HWY_DLLEXPORT __attribute__((visibility("default")))
|
|
25
|
+
#endif
|
|
26
|
+
#else // defined(hwy_EXPORTS)
|
|
27
|
+
/* We are using this library */
|
|
28
|
+
#ifdef _WIN32
|
|
29
|
+
#define HWY_DLLEXPORT __declspec(dllimport)
|
|
30
|
+
#else
|
|
31
|
+
#define HWY_DLLEXPORT __attribute__((visibility("default")))
|
|
32
|
+
#endif
|
|
33
|
+
#endif // defined(hwy_EXPORTS)
|
|
34
|
+
#endif // HWY_DLLEXPORT
|
|
35
|
+
|
|
36
|
+
#ifndef HWY_CONTRIB_DLLEXPORT
|
|
37
|
+
#if defined(hwy_contrib_EXPORTS)
|
|
38
|
+
/* We are building this library */
|
|
39
|
+
#ifdef _WIN32
|
|
40
|
+
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
|
|
41
|
+
#else
|
|
42
|
+
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
|
|
43
|
+
#endif
|
|
44
|
+
#else // defined(hwy_contrib_EXPORTS)
|
|
45
|
+
/* We are using this library */
|
|
46
|
+
#ifdef _WIN32
|
|
47
|
+
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
|
|
48
|
+
#else
|
|
49
|
+
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
|
|
50
|
+
#endif
|
|
51
|
+
#endif // defined(hwy_contrib_EXPORTS)
|
|
52
|
+
#endif // HWY_CONTRIB_DLLEXPORT
|
|
53
|
+
|
|
54
|
+
#ifndef HWY_TEST_DLLEXPORT
|
|
55
|
+
#if defined(hwy_test_EXPORTS)
|
|
56
|
+
/* We are building this library */
|
|
57
|
+
#ifdef _WIN32
|
|
58
|
+
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
|
|
59
|
+
#else
|
|
60
|
+
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
|
|
61
|
+
#endif
|
|
62
|
+
#else // defined(hwy_test_EXPORTS)
|
|
63
|
+
/* We are using this library */
|
|
64
|
+
#ifdef _WIN32
|
|
65
|
+
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
|
|
66
|
+
#else
|
|
67
|
+
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
|
|
68
|
+
#endif
|
|
69
|
+
#endif // defined(hwy_test_EXPORTS)
|
|
70
|
+
#endif // HWY_TEST_DLLEXPORT
|
|
71
|
+
|
|
72
|
+
#endif // !HWY_SHARED_DEFINE
|
|
73
|
+
|
|
74
|
+
#endif /* HWY_DLLEXPORT_H */
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
// Copyright 2019 Google LLC
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
|
|
16
|
+
#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
|
|
17
|
+
#define HIGHWAY_HWY_NANOBENCHMARK_H_
|
|
18
|
+
|
|
19
|
+
// Benchmarks functions of a single integer argument with realistic branch
|
|
20
|
+
// prediction hit rates. Uses a robust estimator to summarize the measurements.
|
|
21
|
+
// The precision is about 0.2%.
|
|
22
|
+
//
|
|
23
|
+
// Examples: see nanobenchmark_test.cc.
|
|
24
|
+
//
|
|
25
|
+
// Background: Microbenchmarks such as http://github.com/google/benchmark
|
|
26
|
+
// can measure elapsed times on the order of a microsecond. Shorter functions
|
|
27
|
+
// are typically measured by repeating them thousands of times and dividing
|
|
28
|
+
// the total elapsed time by this count. Unfortunately, repetition (especially
|
|
29
|
+
// with the same input parameter!) influences the runtime. In time-critical
|
|
30
|
+
// code, it is reasonable to expect warm instruction/data caches and TLBs,
|
|
31
|
+
// but a perfect record of which branches will be taken is unrealistic.
|
|
32
|
+
// Unless the application also repeatedly invokes the measured function with
|
|
33
|
+
// the same parameter, the benchmark is measuring something very different -
|
|
34
|
+
// a best-case result, almost as if the parameter were made a compile-time
|
|
35
|
+
// constant. This may lead to erroneous conclusions about branch-heavy
|
|
36
|
+
// algorithms outperforming branch-free alternatives.
|
|
37
|
+
//
|
|
38
|
+
// Our approach differs in three ways. Adding fences to the timer functions
|
|
39
|
+
// reduces variability due to instruction reordering, improving the timer
|
|
40
|
+
// resolution to about 40 CPU cycles. However, shorter functions must still
|
|
41
|
+
// be invoked repeatedly. For more realistic branch prediction performance,
|
|
42
|
+
// we vary the input parameter according to a user-specified distribution.
|
|
43
|
+
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
|
|
44
|
+
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
|
|
45
|
+
// central tendency of the measurement samples with the "half sample mode",
|
|
46
|
+
// which is more robust to outliers and skewed data than the mean or median.
|
|
47
|
+
|
|
48
|
+
#include <stddef.h>
|
|
49
|
+
#include <stdint.h>
|
|
50
|
+
|
|
51
|
+
#include "hwy/highway_export.h"
|
|
52
|
+
#include "hwy/timer.h"
|
|
53
|
+
|
|
54
|
+
// Enables sanity checks that verify correct operation at the cost of
|
|
55
|
+
// longer benchmark runs.
|
|
56
|
+
#ifndef NANOBENCHMARK_ENABLE_CHECKS
|
|
57
|
+
#define NANOBENCHMARK_ENABLE_CHECKS 0
|
|
58
|
+
#endif
|
|
59
|
+
|
|
60
|
+
// Unconditional check: if "condition" evaluates to false, prints the current
// line number to stderr and calls abort(). "condition" is evaluated exactly
// once. Uses fprintf/stderr (<stdio.h>) and abort (<stdlib.h>).
//
// The body is wrapped in do { } while (0) so that an invocation followed by a
// semicolon forms exactly one statement and can be used in an unbraced
// if/else. Invocations must therefore end with a semicolon.
#define NANOBENCHMARK_CHECK_ALWAYS(condition)                               \
  do {                                                                      \
    if (!(condition)) {                                                     \
      fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
      abort();                                                              \
    }                                                                       \
  } while (0)
|
|
65
|
+
|
|
66
|
+
// Optional flavor of NANOBENCHMARK_CHECK_ALWAYS: expands to nothing (so the
// condition is not evaluated) unless NANOBENCHMARK_ENABLE_CHECKS is nonzero.
#if !NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_CHECK(condition)
#else
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
#endif
|
|
71
|
+
|
|
72
|
+
namespace hwy {
|
|
73
|
+
|
|
74
|
+
// Returns 1, but without the compiler knowing what the value is. This prevents
|
|
75
|
+
// optimizing out code.
|
|
76
|
+
// Declaration only: no body is provided in this header. Takes no arguments.
// NOTE(review): presumably defined in the library's implementation file and
// exported via HWY_DLLEXPORT (hwy/highway_export.h) - confirm.
HWY_DLLEXPORT int Unpredictable1();
|
|
77
|
+
|
|
78
|
+
// Value fed to the measured function to steer its work, for example the number
// of bytes to copy.
typedef size_t FuncInput;
|
|
80
|
+
|
|
81
|
+
// Value returned by Func as "proof of work", so that the compiler cannot elide
// the measured computation.
typedef uint64_t FuncOutput;
|
|
83
|
+
|
|
84
|
+
// Function to measure: either 1) a captureless lambda or function with two
|
|
85
|
+
// arguments or 2) a lambda with capture, in which case the first argument
|
|
86
|
+
// is reserved for use by MeasureClosure.
|
|
87
|
+
using Func = FuncOutput (*)(const void*, FuncInput);
|
|
88
|
+
|
|
89
|
+
// Tuning knobs for a measurement; together they determine the
// precision/resolution achieved and how long measuring takes.
struct Params {
  // Divisor of the timer resolution that yields the best-case precision.
  // Raising it causes more calls to Func and gives higher precision.
  size_t precision_divisor{1024};

  // Size of the full input distribution relative to the subset. Must be at
  // least 2. Larger values model the given input distribution more faithfully
  // at the cost of longer measurement time.
  size_t subset_ratio{2};

  // Combined with the estimated duration of Func, this decides how many times
  // Func is called before the sample variability is checked. Larger values
  // increase measurement time, memory/cache use and precision.
  double seconds_per_eval{4E-3};

  // Minimum number of samples required before the central tendency is
  // estimated.
  size_t min_samples_per_eval{7};

  // For skewed/fat-tailed distributions the mode estimates the central
  // tendency better than the median, but it needs enough samples relative to
  // the width of half-ranges.
  size_t min_mode_samples{64};

  // Upper bound on acceptable variability, i.e. median absolute deviation
  // divided by the center.
  double target_rel_mad{0.002};

  // Give up after this many evals if target_rel_mad has not been reached;
  // this prevents infinite loops.
  size_t max_evals{9};

  // If true, additional statistics are printed to stdout.
  bool verbose{true};
};
|
|
123
|
+
|
|
124
|
+
// Measurement result for each unique input.
|
|
125
|
+
struct Result {
|
|
126
|
+
FuncInput input;
|
|
127
|
+
|
|
128
|
+
// Robust estimate (mode or median) of duration.
|
|
129
|
+
float ticks;
|
|
130
|
+
|
|
131
|
+
// Measure of variability (median absolute deviation relative to "ticks").
|
|
132
|
+
float variability;
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
// Precisely measures the number of ticks elapsed when calling "func" with the
|
|
136
|
+
// given inputs, shuffled to ensure realistic branch prediction hit rates.
|
|
137
|
+
//
|
|
138
|
+
// "func" returns a 'proof of work' to ensure its computations are not elided.
|
|
139
|
+
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
|
|
140
|
+
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
|
|
141
|
+
// "func". The values should be chosen to maximize coverage of "func". This
|
|
142
|
+
// represents a distribution, so a value's frequency should reflect its
|
|
143
|
+
// probability in the real application. Order does not matter; for example, a
|
|
144
|
+
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
|
|
145
|
+
// Returns how many Result were written to "results": one per unique input, or
|
|
146
|
+
// zero if the measurement failed (an error message goes to stderr).
|
|
147
|
+
HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg,
|
|
148
|
+
const FuncInput* inputs, size_t num_inputs,
|
|
149
|
+
Result* results, const Params& p = Params());
|
|
150
|
+
|
|
151
|
+
// Calls operator() of the given closure (lambda function).
|
|
152
|
+
template <class Closure>
|
|
153
|
+
static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
|
|
154
|
+
return (*f)(input);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Same as Measure, except "closure" is typically a lambda function of
|
|
158
|
+
// FuncInput -> FuncOutput with a capture list.
|
|
159
|
+
template <class Closure>
|
|
160
|
+
static inline size_t MeasureClosure(const Closure& closure,
|
|
161
|
+
const FuncInput* inputs,
|
|
162
|
+
const size_t num_inputs, Result* results,
|
|
163
|
+
const Params& p = Params()) {
|
|
164
|
+
return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
|
|
165
|
+
reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
|
|
166
|
+
results, p);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
} // namespace hwy
|
|
170
|
+
|
|
171
|
+
#endif // HIGHWAY_HWY_NANOBENCHMARK_H_
|