@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
package/include/hwy/targets.h
CHANGED
|
@@ -82,9 +82,17 @@ HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
|
|
|
82
82
|
|
|
83
83
|
#endif // HWY_NO_LIBCXX
|
|
84
84
|
|
|
85
|
+
// Returns a string that satisfies gtest IsValidParamName(). No longer report
|
|
86
|
+
// targets as "Unknown" if they are for a different architecture, because some
|
|
87
|
+
// users unconditionally disable targets and we want to see which.
|
|
85
88
|
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
|
|
86
89
|
switch (target) {
|
|
87
|
-
|
|
90
|
+
case HWY_EMU128:
|
|
91
|
+
return "EMU128";
|
|
92
|
+
case HWY_SCALAR:
|
|
93
|
+
return "SCALAR";
|
|
94
|
+
|
|
95
|
+
// X86
|
|
88
96
|
case HWY_SSE2:
|
|
89
97
|
return "SSE2";
|
|
90
98
|
case HWY_SSSE3:
|
|
@@ -101,9 +109,10 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
|
|
|
101
109
|
return "AVX3_ZEN4";
|
|
102
110
|
case HWY_AVX3_SPR:
|
|
103
111
|
return "AVX3_SPR";
|
|
104
|
-
|
|
112
|
+
case HWY_AVX10_2:
|
|
113
|
+
return "AVX10_2";
|
|
105
114
|
|
|
106
|
-
|
|
115
|
+
// ARM
|
|
107
116
|
case HWY_SVE2_128:
|
|
108
117
|
return "SVE2_128";
|
|
109
118
|
case HWY_SVE_256:
|
|
@@ -118,46 +127,71 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
|
|
|
118
127
|
return "NEON";
|
|
119
128
|
case HWY_NEON_WITHOUT_AES:
|
|
120
129
|
return "NEON_WITHOUT_AES";
|
|
121
|
-
#endif
|
|
122
130
|
|
|
123
|
-
|
|
131
|
+
// PPC
|
|
124
132
|
case HWY_PPC8:
|
|
125
133
|
return "PPC8";
|
|
126
134
|
case HWY_PPC9:
|
|
127
135
|
return "PPC9";
|
|
128
136
|
case HWY_PPC10:
|
|
129
137
|
return "PPC10";
|
|
130
|
-
#endif
|
|
131
138
|
|
|
132
|
-
|
|
139
|
+
// S390X
|
|
133
140
|
case HWY_Z14:
|
|
134
141
|
return "Z14";
|
|
135
142
|
case HWY_Z15:
|
|
136
143
|
return "Z15";
|
|
137
|
-
#endif
|
|
138
144
|
|
|
139
|
-
|
|
145
|
+
// WASM
|
|
140
146
|
case HWY_WASM:
|
|
141
147
|
return "WASM";
|
|
142
148
|
case HWY_WASM_EMU256:
|
|
143
149
|
return "WASM_EMU256";
|
|
144
|
-
#endif
|
|
145
150
|
|
|
146
|
-
|
|
151
|
+
// RISCV
|
|
147
152
|
case HWY_RVV:
|
|
148
153
|
return "RVV";
|
|
149
|
-
#endif
|
|
150
|
-
|
|
151
|
-
case HWY_EMU128:
|
|
152
|
-
return "EMU128";
|
|
153
|
-
case HWY_SCALAR:
|
|
154
|
-
return "SCALAR";
|
|
155
154
|
|
|
156
|
-
|
|
157
|
-
|
|
155
|
+
// LOONGARCH
|
|
156
|
+
case HWY_LSX:
|
|
157
|
+
return "LSX";
|
|
158
|
+
case HWY_LASX:
|
|
159
|
+
return "LASX";
|
|
158
160
|
}
|
|
161
|
+
|
|
162
|
+
return "Unknown";
|
|
159
163
|
}
|
|
160
164
|
|
|
165
|
+
// Invokes VISITOR(TARGET, NAMESPACE) for all enabled targets. Alphabetic order.
|
|
166
|
+
#define HWY_VISIT_TARGETS(VISITOR) \
|
|
167
|
+
HWY_VISIT_AVX10_2(VISITOR) \
|
|
168
|
+
HWY_VISIT_AVX2(VISITOR) \
|
|
169
|
+
HWY_VISIT_AVX3(VISITOR) \
|
|
170
|
+
HWY_VISIT_AVX3_DL(VISITOR) \
|
|
171
|
+
HWY_VISIT_AVX3_SPR(VISITOR) \
|
|
172
|
+
HWY_VISIT_AVX3_ZEN4(VISITOR) \
|
|
173
|
+
HWY_VISIT_FALLBACK(VISITOR) \
|
|
174
|
+
HWY_VISIT_LASX(VISITOR) \
|
|
175
|
+
HWY_VISIT_LSX(VISITOR) \
|
|
176
|
+
HWY_VISIT_NEON(VISITOR) \
|
|
177
|
+
HWY_VISIT_NEON_BF16(VISITOR) \
|
|
178
|
+
HWY_VISIT_NEON_WITHOUT_AES(VISITOR) \
|
|
179
|
+
HWY_VISIT_PPC10(VISITOR) \
|
|
180
|
+
HWY_VISIT_PPC8(VISITOR) \
|
|
181
|
+
HWY_VISIT_PPC9(VISITOR) \
|
|
182
|
+
HWY_VISIT_RVV(VISITOR) \
|
|
183
|
+
HWY_VISIT_SSE2(VISITOR) \
|
|
184
|
+
HWY_VISIT_SSE4(VISITOR) \
|
|
185
|
+
HWY_VISIT_SSSE3(VISITOR) \
|
|
186
|
+
HWY_VISIT_SVE(VISITOR) \
|
|
187
|
+
HWY_VISIT_SVE2(VISITOR) \
|
|
188
|
+
HWY_VISIT_SVE2_128(VISITOR) \
|
|
189
|
+
HWY_VISIT_SVE_256(VISITOR) \
|
|
190
|
+
HWY_VISIT_WASM(VISITOR) \
|
|
191
|
+
HWY_VISIT_WASM_EMU256(VISITOR) \
|
|
192
|
+
HWY_VISIT_Z14(VISITOR) \
|
|
193
|
+
HWY_VISIT_Z15(VISITOR)
|
|
194
|
+
|
|
161
195
|
// The maximum number of dynamic targets on any architecture is defined by
|
|
162
196
|
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
|
|
163
197
|
|
|
@@ -205,7 +239,7 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
|
|
|
205
239
|
nullptr, /* reserved */ \
|
|
206
240
|
nullptr, /* reserved */ \
|
|
207
241
|
nullptr, /* reserved */ \
|
|
208
|
-
|
|
242
|
+
HWY_CHOOSE_AVX10_2(func_name), /* AVX10_2 */ \
|
|
209
243
|
HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \
|
|
210
244
|
nullptr, /* reserved */ \
|
|
211
245
|
HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \
|
|
@@ -284,6 +318,14 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
|
|
|
284
318
|
HWY_CHOOSE_WASM(func_name), /* WASM */ \
|
|
285
319
|
nullptr /* reserved */
|
|
286
320
|
|
|
321
|
+
#elif HWY_ARCH_LOONGARCH
|
|
322
|
+
#define HWY_MAX_DYNAMIC_TARGETS 3
|
|
323
|
+
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_LOONGARCH
|
|
324
|
+
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
|
325
|
+
nullptr, /* reserved */ \
|
|
326
|
+
HWY_CHOOSE_LASX(func_name), /* LASX */ \
|
|
327
|
+
HWY_CHOOSE_LSX(func_name) /* LSX */
|
|
328
|
+
|
|
287
329
|
#else
|
|
288
330
|
// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
|
|
289
331
|
// still creating single-entry tables in HWY_EXPORT to ensure portability.
|
package/include/hwy/timer-inl.h
CHANGED
|
@@ -13,9 +13,10 @@
|
|
|
13
13
|
// See the License for the specific language governing permissions and
|
|
14
14
|
// limitations under the License.
|
|
15
15
|
|
|
16
|
-
// High-resolution and high-precision timer
|
|
16
|
+
// DEPRECATED, use timer.h instead.
|
|
17
|
+
|
|
18
|
+
#include "hwy/timer.h"
|
|
17
19
|
|
|
18
|
-
// Per-target include guard
|
|
19
20
|
#if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
|
20
21
|
#ifdef HIGHWAY_HWY_TIMER_INL_H_
|
|
21
22
|
#undef HIGHWAY_HWY_TIMER_INL_H_
|
|
@@ -25,170 +26,17 @@
|
|
|
25
26
|
|
|
26
27
|
#include "hwy/highway.h"
|
|
27
28
|
|
|
28
|
-
#if defined(_WIN32) || defined(_WIN64)
|
|
29
|
-
#ifndef NOMINMAX
|
|
30
|
-
#define NOMINMAX
|
|
31
|
-
#endif // NOMINMAX
|
|
32
|
-
#include <windows.h>
|
|
33
|
-
#endif
|
|
34
|
-
|
|
35
|
-
#if defined(__APPLE__)
|
|
36
|
-
#include <mach/mach.h>
|
|
37
|
-
#include <mach/mach_time.h>
|
|
38
|
-
#endif
|
|
39
|
-
|
|
40
|
-
#if defined(__HAIKU__)
|
|
41
|
-
#include <OS.h>
|
|
42
|
-
#endif
|
|
43
|
-
|
|
44
|
-
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
|
45
|
-
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
|
|
46
|
-
#endif
|
|
47
|
-
|
|
48
|
-
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
49
|
-
#include <intrin.h>
|
|
50
|
-
#endif
|
|
51
|
-
|
|
52
|
-
#include <stdint.h>
|
|
53
|
-
#include <time.h> // clock_gettime
|
|
54
|
-
|
|
55
29
|
HWY_BEFORE_NAMESPACE();
|
|
56
30
|
namespace hwy {
|
|
57
31
|
namespace HWY_NAMESPACE {
|
|
58
32
|
namespace timer {
|
|
59
33
|
|
|
60
|
-
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
|
|
61
|
-
// unsigned to guarantee wraparound on overflow.
|
|
62
|
-
using Ticks = uint64_t;
|
|
34
|
+
// Deprecated aliases so that old code still compiles. Prefer to use
|
|
35
|
+
// `hwy::timer::*` from timer.h because that does not require highway.h.
|
|
36
|
+
using Ticks = hwy::timer::Ticks;
|
|
63
37
|
|
|
64
|
-
// Start/Stop return absolute timestamps and must be placed immediately before
|
|
65
|
-
// and after the region to measure. We provide separate Start/Stop functions
|
|
66
|
-
// because they use different fences.
|
|
67
|
-
//
|
|
68
|
-
// Background: RDTSC is not 'serializing'; earlier instructions may complete
|
|
69
|
-
// after it, and/or later instructions may complete before it. 'Fences' ensure
|
|
70
|
-
// regions' elapsed times are independent of such reordering. The only
|
|
71
|
-
// documented unprivileged serializing instruction is CPUID, which acts as a
|
|
72
|
-
// full fence (no reordering across it in either direction). Unfortunately
|
|
73
|
-
// the latency of CPUID varies wildly (perhaps made worse by not initializing
|
|
74
|
-
// its EAX input). Because it cannot reliably be deducted from the region's
|
|
75
|
-
// elapsed time, it must not be included in the region to measure (i.e.
|
|
76
|
-
// between the two RDTSC).
|
|
77
|
-
//
|
|
78
|
-
// The newer RDTSCP is sometimes described as serializing, but it actually
|
|
79
|
-
// only serves as a half-fence with release semantics. Although all
|
|
80
|
-
// instructions in the region will complete before the final timestamp is
|
|
81
|
-
// captured, subsequent instructions may leak into the region and increase the
|
|
82
|
-
// elapsed time. Inserting another fence after the final RDTSCP would prevent
|
|
83
|
-
// such reordering without affecting the measured region.
|
|
84
|
-
//
|
|
85
|
-
// Fortunately, such a fence exists. The LFENCE instruction is only documented
|
|
86
|
-
// to delay later loads until earlier loads are visible. However, Intel's
|
|
87
|
-
// reference manual says it acts as a full fence (waiting until all earlier
|
|
88
|
-
// instructions have completed, and delaying later instructions until it
|
|
89
|
-
// completes). AMD assigns the same behavior to MFENCE.
|
|
90
|
-
//
|
|
91
|
-
// We need a fence before the initial RDTSC to prevent earlier instructions
|
|
92
|
-
// from leaking into the region, and arguably another after RDTSC to avoid
|
|
93
|
-
// region instructions from completing before the timestamp is recorded.
|
|
94
|
-
// When surrounded by fences, the additional RDTSCP half-fence provides no
|
|
95
|
-
// benefit, so the initial timestamp can be recorded via RDTSC, which has
|
|
96
|
-
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
|
|
97
|
-
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
|
|
98
|
-
//
|
|
99
|
-
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
|
|
100
|
-
// However, Stop+Stop includes an LFENCE in the region measurements, which
|
|
101
|
-
// adds a delay dependent on earlier loads. The combination of Start+Stop
|
|
102
|
-
// is faster than Start+Start and more consistent than Stop+Stop because
|
|
103
|
-
// the first LFENCE already delayed subsequent loads before the measured
|
|
104
|
-
// region. This combination seems not to have been considered in prior work:
|
|
105
|
-
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
|
|
106
|
-
//
|
|
107
|
-
// Note: performance counters can measure 'exact' instructions-retired or
|
|
108
|
-
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
|
|
109
|
-
// requires fences. Unfortunately, it is not accessible on all OSes and we
|
|
110
|
-
// prefer to avoid kernel-mode drivers. Performance counters are also affected
|
|
111
|
-
// by several under/over-count errata, so we use the TSC instead.
|
|
112
|
-
|
|
113
|
-
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
|
|
114
|
-
// divide by InvariantTicksPerSecond.
|
|
115
|
-
inline Ticks Start() {
|
|
116
|
-
Ticks t;
|
|
117
|
-
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
|
118
|
-
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
|
|
119
|
-
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
|
|
120
|
-
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
|
|
121
|
-
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
|
|
122
|
-
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
123
|
-
_ReadWriteBarrier();
|
|
124
|
-
_mm_lfence();
|
|
125
|
-
_ReadWriteBarrier();
|
|
126
|
-
t = __rdtsc();
|
|
127
|
-
_ReadWriteBarrier();
|
|
128
|
-
_mm_lfence();
|
|
129
|
-
_ReadWriteBarrier();
|
|
130
|
-
#elif HWY_ARCH_X86_64
|
|
131
|
-
asm volatile(
|
|
132
|
-
"lfence\n\t"
|
|
133
|
-
"rdtsc\n\t"
|
|
134
|
-
"shl $32, %%rdx\n\t"
|
|
135
|
-
"or %%rdx, %0\n\t"
|
|
136
|
-
"lfence"
|
|
137
|
-
: "=a"(t)
|
|
138
|
-
:
|
|
139
|
-
// "memory" avoids reordering. rdx = TSC >> 32.
|
|
140
|
-
// "cc" = flags modified by SHL.
|
|
141
|
-
: "rdx", "memory", "cc");
|
|
142
|
-
#elif HWY_ARCH_RISCV
|
|
143
|
-
asm volatile("fence; rdtime %0" : "=r"(t));
|
|
144
|
-
#elif defined(_WIN32) || defined(_WIN64)
|
|
145
|
-
LARGE_INTEGER counter;
|
|
146
|
-
(void)QueryPerformanceCounter(&counter);
|
|
147
|
-
t = counter.QuadPart;
|
|
148
|
-
#elif defined(__APPLE__)
|
|
149
|
-
t = mach_absolute_time();
|
|
150
|
-
#elif defined(__HAIKU__)
|
|
151
|
-
t = system_time_nsecs(); // since boot
|
|
152
|
-
#else // POSIX
|
|
153
|
-
timespec ts;
|
|
154
|
-
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
155
|
-
t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
|
|
156
|
-
#endif
|
|
157
|
-
return t;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
// WARNING: on x86, caller must check HasRDTSCP before using this!
|
|
161
|
-
inline Ticks Stop() {
|
|
162
|
-
uint64_t t;
|
|
163
|
-
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
|
164
|
-
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
|
|
165
|
-
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
|
|
166
|
-
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
|
|
167
|
-
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
|
|
168
|
-
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
169
|
-
_ReadWriteBarrier();
|
|
170
|
-
unsigned aux;
|
|
171
|
-
t = __rdtscp(&aux);
|
|
172
|
-
_ReadWriteBarrier();
|
|
173
|
-
_mm_lfence();
|
|
174
|
-
_ReadWriteBarrier();
|
|
175
|
-
#elif HWY_ARCH_X86_64
|
|
176
|
-
// Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
|
|
177
|
-
asm volatile(
|
|
178
|
-
"rdtscp\n\t"
|
|
179
|
-
"shl $32, %%rdx\n\t"
|
|
180
|
-
"or %%rdx, %0\n\t"
|
|
181
|
-
"lfence"
|
|
182
|
-
: "=a"(t)
|
|
183
|
-
:
|
|
184
|
-
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
|
|
185
|
-
// "cc" = flags modified by SHL.
|
|
186
|
-
: "rcx", "rdx", "memory", "cc");
|
|
187
|
-
#else
|
|
188
|
-
t = Start();
|
|
189
|
-
#endif
|
|
190
|
-
return t;
|
|
191
|
-
}
|
|
38
|
+
inline Ticks Start() { return hwy::timer::Start(); }
|
|
39
|
+
inline Ticks Stop() { return hwy::timer::Stop(); }
|
|
192
40
|
|
|
193
41
|
} // namespace timer
|
|
194
42
|
|
package/include/hwy/timer.h
CHANGED
|
@@ -17,11 +17,39 @@
|
|
|
17
17
|
#define HIGHWAY_HWY_TIMER_H_
|
|
18
18
|
|
|
19
19
|
// Platform-specific timer functions. Provides Now() and functions for
|
|
20
|
-
// interpreting and converting the timer-inl.h Ticks.
|
|
20
|
+
// interpreting and converting Ticks.
|
|
21
21
|
|
|
22
22
|
#include <stdint.h>
|
|
23
|
+
#include <time.h> // clock_gettime
|
|
23
24
|
|
|
24
|
-
#include "hwy/highway_export.h"
|
|
25
|
+
#include "hwy/base.h"
|
|
26
|
+
|
|
27
|
+
#if defined(_WIN32) || defined(_WIN64)
|
|
28
|
+
#ifndef NOMINMAX
|
|
29
|
+
#define NOMINMAX
|
|
30
|
+
#endif // NOMINMAX
|
|
31
|
+
#ifndef WIN32_LEAN_AND_MEAN
|
|
32
|
+
#define WIN32_LEAN_AND_MEAN
|
|
33
|
+
#endif // WIN32_LEAN_AND_MEAN
|
|
34
|
+
#include <windows.h>
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
#if defined(__APPLE__)
|
|
38
|
+
#include <mach/mach.h>
|
|
39
|
+
#include <mach/mach_time.h>
|
|
40
|
+
#endif
|
|
41
|
+
|
|
42
|
+
#if defined(__HAIKU__)
|
|
43
|
+
#include <OS.h>
|
|
44
|
+
#endif
|
|
45
|
+
|
|
46
|
+
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
|
47
|
+
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
|
|
48
|
+
#endif
|
|
49
|
+
|
|
50
|
+
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
51
|
+
#include <intrin.h>
|
|
52
|
+
#endif
|
|
25
53
|
|
|
26
54
|
namespace hwy {
|
|
27
55
|
namespace platform {
|
|
@@ -32,7 +60,7 @@ namespace platform {
|
|
|
32
60
|
// Uses InvariantTicksPerSecond and the baseline version of timer::Start().
|
|
33
61
|
HWY_DLLEXPORT double Now();
|
|
34
62
|
|
|
35
|
-
// Functions for use with timer-inl.h:
|
|
63
|
+
// Functions related to `Ticks` below.
|
|
36
64
|
|
|
37
65
|
// Returns whether it is safe to call timer::Stop without executing an illegal
|
|
38
66
|
// instruction; if false, fills cpu100 (a pointer to a 100 character buffer)
|
|
@@ -65,6 +93,145 @@ static inline double SecondsSince(const Timestamp& t0) {
|
|
|
65
93
|
return t1.t - t0.t;
|
|
66
94
|
}
|
|
67
95
|
|
|
96
|
+
// Low-level Start/Stop functions, previously in timer-inl.h.
|
|
97
|
+
|
|
98
|
+
namespace timer {
|
|
99
|
+
|
|
100
|
+
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
|
|
101
|
+
// unsigned to guarantee wraparound on overflow.
|
|
102
|
+
using Ticks = uint64_t;
|
|
103
|
+
|
|
104
|
+
// Start/Stop return absolute timestamps and must be placed immediately before
|
|
105
|
+
// and after the region to measure. We provide separate Start/Stop functions
|
|
106
|
+
// because they use different fences.
|
|
107
|
+
//
|
|
108
|
+
// Background: RDTSC is not 'serializing'; earlier instructions may complete
|
|
109
|
+
// after it, and/or later instructions may complete before it. 'Fences' ensure
|
|
110
|
+
// regions' elapsed times are independent of such reordering. The only
|
|
111
|
+
// documented unprivileged serializing instruction is CPUID, which acts as a
|
|
112
|
+
// full fence (no reordering across it in either direction). Unfortunately
|
|
113
|
+
// the latency of CPUID varies wildly (perhaps made worse by not initializing
|
|
114
|
+
// its EAX input). Because it cannot reliably be deducted from the region's
|
|
115
|
+
// elapsed time, it must not be included in the region to measure (i.e.
|
|
116
|
+
// between the two RDTSC).
|
|
117
|
+
//
|
|
118
|
+
// The newer RDTSCP is sometimes described as serializing, but it actually
|
|
119
|
+
// only serves as a half-fence with release semantics. Although all
|
|
120
|
+
// instructions in the region will complete before the final timestamp is
|
|
121
|
+
// captured, subsequent instructions may leak into the region and increase the
|
|
122
|
+
// elapsed time. Inserting another fence after the final `RDTSCP` would prevent
|
|
123
|
+
// such reordering without affecting the measured region.
|
|
124
|
+
//
|
|
125
|
+
// Fortunately, such a fence exists. The LFENCE instruction is only documented
|
|
126
|
+
// to delay later loads until earlier loads are visible. However, Intel's
|
|
127
|
+
// reference manual says it acts as a full fence (waiting until all earlier
|
|
128
|
+
// instructions have completed, and delaying later instructions until it
|
|
129
|
+
// completes). AMD assigns the same behavior to MFENCE.
|
|
130
|
+
//
|
|
131
|
+
// We need a fence before the initial RDTSC to prevent earlier instructions
|
|
132
|
+
// from leaking into the region, and arguably another after RDTSC to avoid
|
|
133
|
+
// region instructions from completing before the timestamp is recorded.
|
|
134
|
+
// When surrounded by fences, the additional `RDTSCP` half-fence provides no
|
|
135
|
+
// benefit, so the initial timestamp can be recorded via RDTSC, which has
|
|
136
|
+
// lower overhead than `RDTSCP` because it does not read TSC_AUX. In summary,
|
|
137
|
+
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
|
|
138
|
+
//
|
|
139
|
+
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
|
|
140
|
+
// However, Stop+Stop includes an LFENCE in the region measurements, which
|
|
141
|
+
// adds a delay dependent on earlier loads. The combination of Start+Stop
|
|
142
|
+
// is faster than Start+Start and more consistent than Stop+Stop because
|
|
143
|
+
// the first LFENCE already delayed subsequent loads before the measured
|
|
144
|
+
// region. This combination seems not to have been considered in prior work:
|
|
145
|
+
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
|
|
146
|
+
//
|
|
147
|
+
// Note: performance counters can measure 'exact' instructions-retired or
|
|
148
|
+
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
|
|
149
|
+
// requires fences. Unfortunately, it is not accessible on all OSes and we
|
|
150
|
+
// prefer to avoid kernel-mode drivers. Performance counters are also affected
|
|
151
|
+
// by several under/over-count errata, so we use the TSC instead.
|
|
152
|
+
|
|
153
|
+
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
|
|
154
|
+
// divide by InvariantTicksPerSecond.
|
|
155
|
+
static HWY_INLINE Ticks Start() {
|
|
156
|
+
Ticks t;
|
|
157
|
+
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
|
158
|
+
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
|
|
159
|
+
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
|
|
160
|
+
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
|
|
161
|
+
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
|
|
162
|
+
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
163
|
+
_ReadWriteBarrier();
|
|
164
|
+
_mm_lfence();
|
|
165
|
+
_ReadWriteBarrier();
|
|
166
|
+
t = __rdtsc();
|
|
167
|
+
_ReadWriteBarrier();
|
|
168
|
+
_mm_lfence();
|
|
169
|
+
_ReadWriteBarrier();
|
|
170
|
+
#elif HWY_ARCH_X86_64
|
|
171
|
+
asm volatile(
|
|
172
|
+
"lfence\n\t"
|
|
173
|
+
"rdtsc\n\t"
|
|
174
|
+
"shl $32, %%rdx\n\t"
|
|
175
|
+
"or %%rdx, %0\n\t"
|
|
176
|
+
"lfence"
|
|
177
|
+
: "=a"(t)
|
|
178
|
+
:
|
|
179
|
+
// "memory" avoids reordering. rdx = TSC >> 32.
|
|
180
|
+
// "cc" = flags modified by SHL.
|
|
181
|
+
: "rdx", "memory", "cc");
|
|
182
|
+
#elif HWY_ARCH_RISCV
|
|
183
|
+
asm volatile("fence; rdtime %0" : "=r"(t));
|
|
184
|
+
#elif defined(_WIN32) || defined(_WIN64)
|
|
185
|
+
LARGE_INTEGER counter;
|
|
186
|
+
(void)QueryPerformanceCounter(&counter);
|
|
187
|
+
t = counter.QuadPart;
|
|
188
|
+
#elif defined(__APPLE__)
|
|
189
|
+
t = mach_absolute_time();
|
|
190
|
+
#elif defined(__HAIKU__)
|
|
191
|
+
t = system_time_nsecs(); // since boot
|
|
192
|
+
#else // POSIX
|
|
193
|
+
timespec ts;
|
|
194
|
+
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
195
|
+
t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
|
|
196
|
+
#endif
|
|
197
|
+
return t;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// WARNING: on x86, caller must check `HaveTimerStop()` before using this!
|
|
201
|
+
static HWY_INLINE Ticks Stop() {
|
|
202
|
+
uint64_t t;
|
|
203
|
+
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
|
204
|
+
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
|
|
205
|
+
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
|
|
206
|
+
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
|
|
207
|
+
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
|
|
208
|
+
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
|
209
|
+
_ReadWriteBarrier();
|
|
210
|
+
unsigned aux;
|
|
211
|
+
t = __rdtscp(&aux);
|
|
212
|
+
_ReadWriteBarrier();
|
|
213
|
+
_mm_lfence();
|
|
214
|
+
_ReadWriteBarrier();
|
|
215
|
+
#elif HWY_ARCH_X86_64
|
|
216
|
+
// Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
|
|
217
|
+
asm volatile(
|
|
218
|
+
"rdtscp\n\t"
|
|
219
|
+
"shl $32, %%rdx\n\t"
|
|
220
|
+
"or %%rdx, %0\n\t"
|
|
221
|
+
"lfence"
|
|
222
|
+
: "=a"(t)
|
|
223
|
+
:
|
|
224
|
+
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
|
|
225
|
+
// "cc" = flags modified by SHL.
|
|
226
|
+
: "rcx", "rdx", "memory", "cc");
|
|
227
|
+
#else
|
|
228
|
+
t = Start();
|
|
229
|
+
#endif
|
|
230
|
+
return t;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
} // namespace timer
|
|
234
|
+
|
|
68
235
|
} // namespace hwy
|
|
69
236
|
|
|
70
237
|
#endif // HIGHWAY_HWY_TIMER_H_
|
|
package/include/hwy/x86_cpuid.h
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
// Copyright 2025 Google LLC
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
//
|
|
4
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
// you may not use this file except in compliance with the License.
|
|
6
|
+
// You may obtain a copy of the License at
|
|
7
|
+
//
|
|
8
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
//
|
|
10
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
// See the License for the specific language governing permissions and
|
|
14
|
+
// limitations under the License.
|
|
15
|
+
|
|
16
|
+
#ifndef HIGHWAY_HWY_X86_CPUID_H_
|
|
17
|
+
#define HIGHWAY_HWY_X86_CPUID_H_
|
|
18
|
+
|
|
19
|
+
// Wrapper for x86 CPUID intrinsics. Empty on other platforms.
|
|
20
|
+
|
|
21
|
+
#include <stdint.h>
|
|
22
|
+
|
|
23
|
+
#include "hwy/base.h"
|
|
24
|
+
|
|
25
|
+
#if HWY_ARCH_X86
|
|
26
|
+
|
|
27
|
+
#if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
|
|
28
|
+
#include <intrin.h>
|
|
29
|
+
#else
|
|
30
|
+
#include <cpuid.h>
|
|
31
|
+
#endif
|
|
32
|
+
|
|
33
|
+
namespace hwy {
|
|
34
|
+
namespace x86 {
|
|
35
|
+
|
|
36
|
+
// Calls CPUID instruction with eax=level and ecx=count and returns the result
|
|
37
|
+
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
|
|
38
|
+
static inline void Cpuid(const uint32_t level, const uint32_t count,
|
|
39
|
+
uint32_t* HWY_RESTRICT abcd) {
|
|
40
|
+
#if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
|
|
41
|
+
int regs[4];
|
|
42
|
+
__cpuidex(regs, static_cast<int>(level), static_cast<int>(count));
|
|
43
|
+
for (int i = 0; i < 4; ++i) {
|
|
44
|
+
abcd[i] = static_cast<uint32_t>(regs[i]);
|
|
45
|
+
}
|
|
46
|
+
#else // HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
|
|
47
|
+
uint32_t a;
|
|
48
|
+
uint32_t b;
|
|
49
|
+
uint32_t c;
|
|
50
|
+
uint32_t d;
|
|
51
|
+
__cpuid_count(level, count, a, b, c, d);
|
|
52
|
+
abcd[0] = a;
|
|
53
|
+
abcd[1] = b;
|
|
54
|
+
abcd[2] = c;
|
|
55
|
+
abcd[3] = d;
|
|
56
|
+
#endif // HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
static inline bool IsBitSet(const uint32_t reg, const int index) {
|
|
60
|
+
return (reg & (1U << index)) != 0;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
static inline uint32_t MaxLevel() {
|
|
64
|
+
uint32_t abcd[4];
|
|
65
|
+
Cpuid(0, 0, abcd);
|
|
66
|
+
return abcd[0];
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
static inline bool IsAMD() {
|
|
70
|
+
uint32_t abcd[4];
|
|
71
|
+
Cpuid(0, 0, abcd);
|
|
72
|
+
const uint32_t max_level = abcd[0];
|
|
73
|
+
return max_level >= 1 && abcd[1] == 0x68747541 && abcd[2] == 0x444d4163 &&
|
|
74
|
+
abcd[3] == 0x69746e65;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
} // namespace x86
|
|
78
|
+
} // namespace hwy
|
|
79
|
+
|
|
80
|
+
#endif // HWY_ARCH_X86
|
|
81
|
+
#endif // HIGHWAY_HWY_X86_CPUID_H_
|
|
package/include/libheif/heif_cxx.h
CHANGED
|
@@ -345,9 +345,15 @@ namespace heif {
|
|
|
345
345
|
|
|
346
346
|
bool has_channel(enum heif_channel channel) const noexcept;
|
|
347
347
|
|
|
348
|
-
|
|
348
|
+
// DEPRECATED
|
|
349
|
+
const uint8_t* get_plane(enum heif_channel channel, int* out_stride) const noexcept;
|
|
349
350
|
|
|
350
|
-
|
|
351
|
+
// DEPRECATED
|
|
352
|
+
uint8_t* get_plane(enum heif_channel channel, int* out_stride) noexcept;
|
|
353
|
+
|
|
354
|
+
const uint8_t* get_plane2(enum heif_channel channel, size_t* out_stride) const noexcept;
|
|
355
|
+
|
|
356
|
+
uint8_t* get_plane2(enum heif_channel channel, size_t* out_stride) noexcept;
|
|
351
357
|
|
|
352
358
|
// throws Error
|
|
353
359
|
void set_nclx_color_profile(const ColorProfile_nclx&);
|
|
@@ -552,7 +558,11 @@ namespace heif {
|
|
|
552
558
|
heif_reader_trampoline_get_position,
|
|
553
559
|
heif_reader_trampoline_read,
|
|
554
560
|
heif_reader_trampoline_seek,
|
|
555
|
-
heif_reader_trampoline_wait_for_file_size
|
|
561
|
+
heif_reader_trampoline_wait_for_file_size,
|
|
562
|
+
NULL,
|
|
563
|
+
NULL,
|
|
564
|
+
NULL,
|
|
565
|
+
NULL,
|
|
556
566
|
};
|
|
557
567
|
|
|
558
568
|
inline void Context::read_from_reader(Reader& reader, const ReadingOptions& /*opts*/)
|
|
@@ -906,12 +916,22 @@ namespace heif {
|
|
|
906
916
|
return heif_image_has_channel(m_image.get(), channel);
|
|
907
917
|
}
|
|
908
918
|
|
|
909
|
-
inline const uint8_t* Image::get_plane(enum heif_channel channel, size_t* out_stride) const noexcept
|
|
919
|
+
inline const uint8_t* Image::get_plane(enum heif_channel channel, int* out_stride) const noexcept
|
|
920
|
+
{
|
|
921
|
+
return heif_image_get_plane_readonly(m_image.get(), channel, out_stride);
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
inline uint8_t* Image::get_plane(enum heif_channel channel, int* out_stride) noexcept
|
|
925
|
+
{
|
|
926
|
+
return heif_image_get_plane(m_image.get(), channel, out_stride);
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
inline const uint8_t* Image::get_plane2(enum heif_channel channel, size_t* out_stride) const noexcept
|
|
910
930
|
{
|
|
911
931
|
return heif_image_get_plane_readonly2(m_image.get(), channel, out_stride);
|
|
912
932
|
}
|
|
913
933
|
|
|
914
|
-
inline uint8_t* Image::get_plane(enum heif_channel channel, size_t* out_stride) noexcept
|
|
934
|
+
inline uint8_t* Image::get_plane2(enum heif_channel channel, size_t* out_stride) noexcept
|
|
915
935
|
{
|
|
916
936
|
return heif_image_get_plane2(m_image.get(), channel, out_stride);
|
|
917
937
|
}
|