@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-version.h +3 -3
  3. package/include/hwy/abort.h +2 -19
  4. package/include/hwy/aligned_allocator.h +11 -7
  5. package/include/hwy/auto_tune.h +504 -0
  6. package/include/hwy/base.h +425 -104
  7. package/include/hwy/cache_control.h +16 -0
  8. package/include/hwy/detect_compiler_arch.h +32 -1
  9. package/include/hwy/detect_targets.h +251 -67
  10. package/include/hwy/foreach_target.h +35 -0
  11. package/include/hwy/highway.h +185 -76
  12. package/include/hwy/nanobenchmark.h +1 -19
  13. package/include/hwy/ops/arm_neon-inl.h +969 -458
  14. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  15. package/include/hwy/ops/emu128-inl.h +97 -11
  16. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  17. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  18. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  19. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  20. package/include/hwy/ops/rvv-inl.h +546 -51
  21. package/include/hwy/ops/scalar-inl.h +77 -22
  22. package/include/hwy/ops/set_macros-inl.h +138 -17
  23. package/include/hwy/ops/shared-inl.h +50 -10
  24. package/include/hwy/ops/wasm_128-inl.h +137 -92
  25. package/include/hwy/ops/x86_128-inl.h +773 -214
  26. package/include/hwy/ops/x86_256-inl.h +712 -255
  27. package/include/hwy/ops/x86_512-inl.h +429 -753
  28. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  29. package/include/hwy/per_target.h +2 -1
  30. package/include/hwy/profiler.h +622 -486
  31. package/include/hwy/targets.h +62 -20
  32. package/include/hwy/timer-inl.h +8 -160
  33. package/include/hwy/timer.h +170 -3
  34. package/include/hwy/x86_cpuid.h +81 -0
  35. package/include/libheif/heif_cxx.h +25 -5
  36. package/include/libheif/heif_regions.h +5 -5
  37. package/include/libheif/heif_version.h +2 -2
  38. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  39. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  40. package/include/pango-1.0/pango/pango-features.h +3 -3
  41. package/include/pango-1.0/pango/pango-font.h +30 -0
  42. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  43. package/include/zlib.h +3 -3
  44. package/package.json +1 -1
  45. package/versions.json +8 -8
@@ -82,9 +82,17 @@ HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
82
82
 
83
83
  #endif // HWY_NO_LIBCXX
84
84
 
85
+ // Returns a string that satisfies gtest IsValidParamName(). No longer report
86
+ // targets as "Unknown" if they are for a different architecture, because some
87
+ // users unconditionally disable targets and we want to see which.
85
88
  static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
86
89
  switch (target) {
87
- #if HWY_ARCH_X86
90
+ case HWY_EMU128:
91
+ return "EMU128";
92
+ case HWY_SCALAR:
93
+ return "SCALAR";
94
+
95
+ // X86
88
96
  case HWY_SSE2:
89
97
  return "SSE2";
90
98
  case HWY_SSSE3:
@@ -101,9 +109,10 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
101
109
  return "AVX3_ZEN4";
102
110
  case HWY_AVX3_SPR:
103
111
  return "AVX3_SPR";
104
- #endif
112
+ case HWY_AVX10_2:
113
+ return "AVX10_2";
105
114
 
106
- #if HWY_ARCH_ARM
115
+ // ARM
107
116
  case HWY_SVE2_128:
108
117
  return "SVE2_128";
109
118
  case HWY_SVE_256:
@@ -118,46 +127,71 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
118
127
  return "NEON";
119
128
  case HWY_NEON_WITHOUT_AES:
120
129
  return "NEON_WITHOUT_AES";
121
- #endif
122
130
 
123
- #if HWY_ARCH_PPC
131
+ // PPC
124
132
  case HWY_PPC8:
125
133
  return "PPC8";
126
134
  case HWY_PPC9:
127
135
  return "PPC9";
128
136
  case HWY_PPC10:
129
137
  return "PPC10";
130
- #endif
131
138
 
132
- #if HWY_ARCH_S390X
139
+ // S390X
133
140
  case HWY_Z14:
134
141
  return "Z14";
135
142
  case HWY_Z15:
136
143
  return "Z15";
137
- #endif
138
144
 
139
- #if HWY_ARCH_WASM
145
+ // WASM
140
146
  case HWY_WASM:
141
147
  return "WASM";
142
148
  case HWY_WASM_EMU256:
143
149
  return "WASM_EMU256";
144
- #endif
145
150
 
146
- #if HWY_ARCH_RISCV
151
+ // RISCV
147
152
  case HWY_RVV:
148
153
  return "RVV";
149
- #endif
150
-
151
- case HWY_EMU128:
152
- return "EMU128";
153
- case HWY_SCALAR:
154
- return "SCALAR";
155
154
 
156
- default:
157
- return "Unknown"; // must satisfy gtest IsValidParamName()
155
+ // LOONGARCH
156
+ case HWY_LSX:
157
+ return "LSX";
158
+ case HWY_LASX:
159
+ return "LASX";
158
160
  }
161
+
162
+ return "Unknown";
159
163
  }
160
164
 
165
+ // Invokes VISITOR(TARGET, NAMESPACE) for all enabled targets. Alphabetic order.
166
+ #define HWY_VISIT_TARGETS(VISITOR) \
167
+ HWY_VISIT_AVX10_2(VISITOR) \
168
+ HWY_VISIT_AVX2(VISITOR) \
169
+ HWY_VISIT_AVX3(VISITOR) \
170
+ HWY_VISIT_AVX3_DL(VISITOR) \
171
+ HWY_VISIT_AVX3_SPR(VISITOR) \
172
+ HWY_VISIT_AVX3_ZEN4(VISITOR) \
173
+ HWY_VISIT_FALLBACK(VISITOR) \
174
+ HWY_VISIT_LASX(VISITOR) \
175
+ HWY_VISIT_LSX(VISITOR) \
176
+ HWY_VISIT_NEON(VISITOR) \
177
+ HWY_VISIT_NEON_BF16(VISITOR) \
178
+ HWY_VISIT_NEON_WITHOUT_AES(VISITOR) \
179
+ HWY_VISIT_PPC10(VISITOR) \
180
+ HWY_VISIT_PPC8(VISITOR) \
181
+ HWY_VISIT_PPC9(VISITOR) \
182
+ HWY_VISIT_RVV(VISITOR) \
183
+ HWY_VISIT_SSE2(VISITOR) \
184
+ HWY_VISIT_SSE4(VISITOR) \
185
+ HWY_VISIT_SSSE3(VISITOR) \
186
+ HWY_VISIT_SVE(VISITOR) \
187
+ HWY_VISIT_SVE2(VISITOR) \
188
+ HWY_VISIT_SVE2_128(VISITOR) \
189
+ HWY_VISIT_SVE_256(VISITOR) \
190
+ HWY_VISIT_WASM(VISITOR) \
191
+ HWY_VISIT_WASM_EMU256(VISITOR) \
192
+ HWY_VISIT_Z14(VISITOR) \
193
+ HWY_VISIT_Z15(VISITOR)
194
+
161
195
  // The maximum number of dynamic targets on any architecture is defined by
162
196
  // HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
163
197
 
@@ -205,7 +239,7 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
205
239
  nullptr, /* reserved */ \
206
240
  nullptr, /* reserved */ \
207
241
  nullptr, /* reserved */ \
208
- nullptr, /* reserved */ \
242
+ HWY_CHOOSE_AVX10_2(func_name), /* AVX10_2 */ \
209
243
  HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \
210
244
  nullptr, /* reserved */ \
211
245
  HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \
@@ -284,6 +318,14 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
284
318
  HWY_CHOOSE_WASM(func_name), /* WASM */ \
285
319
  nullptr /* reserved */
286
320
 
321
+ #elif HWY_ARCH_LOONGARCH
322
+ #define HWY_MAX_DYNAMIC_TARGETS 3
323
+ #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_LOONGARCH
324
+ #define HWY_CHOOSE_TARGET_LIST(func_name) \
325
+ nullptr, /* reserved */ \
326
+ HWY_CHOOSE_LASX(func_name), /* LASX */ \
327
+ HWY_CHOOSE_LSX(func_name) /* LSX */
328
+
287
329
  #else
288
330
  // Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
289
331
  // still creating single-entry tables in HWY_EXPORT to ensure portability.
@@ -13,9 +13,10 @@
13
13
  // See the License for the specific language governing permissions and
14
14
  // limitations under the License.
15
15
 
16
- // High-resolution and high-precision timer
16
+ // DEPRECATED, use timer.h instead.
17
+
18
+ #include "hwy/timer.h"
17
19
 
18
- // Per-target include guard
19
20
  #if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE)
20
21
  #ifdef HIGHWAY_HWY_TIMER_INL_H_
21
22
  #undef HIGHWAY_HWY_TIMER_INL_H_
@@ -25,170 +26,17 @@
25
26
 
26
27
  #include "hwy/highway.h"
27
28
 
28
- #if defined(_WIN32) || defined(_WIN64)
29
- #ifndef NOMINMAX
30
- #define NOMINMAX
31
- #endif // NOMINMAX
32
- #include <windows.h>
33
- #endif
34
-
35
- #if defined(__APPLE__)
36
- #include <mach/mach.h>
37
- #include <mach/mach_time.h>
38
- #endif
39
-
40
- #if defined(__HAIKU__)
41
- #include <OS.h>
42
- #endif
43
-
44
- #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
45
- #include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
46
- #endif
47
-
48
- #if HWY_ARCH_X86 && HWY_COMPILER_MSVC
49
- #include <intrin.h>
50
- #endif
51
-
52
- #include <stdint.h>
53
- #include <time.h> // clock_gettime
54
-
55
29
  HWY_BEFORE_NAMESPACE();
56
30
  namespace hwy {
57
31
  namespace HWY_NAMESPACE {
58
32
  namespace timer {
59
33
 
60
- // Ticks := platform-specific timer values (CPU cycles on x86). Must be
61
- // unsigned to guarantee wraparound on overflow.
62
- using Ticks = uint64_t;
34
+ // Deprecated aliases so that old code still compiles. Prefer to use
35
+ // `hwy::timer::*` from timer.h because that does not require highway.h.
36
+ using Ticks = hwy::timer::Ticks;
63
37
 
64
- // Start/Stop return absolute timestamps and must be placed immediately before
65
- // and after the region to measure. We provide separate Start/Stop functions
66
- // because they use different fences.
67
- //
68
- // Background: RDTSC is not 'serializing'; earlier instructions may complete
69
- // after it, and/or later instructions may complete before it. 'Fences' ensure
70
- // regions' elapsed times are independent of such reordering. The only
71
- // documented unprivileged serializing instruction is CPUID, which acts as a
72
- // full fence (no reordering across it in either direction). Unfortunately
73
- // the latency of CPUID varies wildly (perhaps made worse by not initializing
74
- // its EAX input). Because it cannot reliably be deducted from the region's
75
- // elapsed time, it must not be included in the region to measure (i.e.
76
- // between the two RDTSC).
77
- //
78
- // The newer RDTSCP is sometimes described as serializing, but it actually
79
- // only serves as a half-fence with release semantics. Although all
80
- // instructions in the region will complete before the final timestamp is
81
- // captured, subsequent instructions may leak into the region and increase the
82
- // elapsed time. Inserting another fence after the final RDTSCP would prevent
83
- // such reordering without affecting the measured region.
84
- //
85
- // Fortunately, such a fence exists. The LFENCE instruction is only documented
86
- // to delay later loads until earlier loads are visible. However, Intel's
87
- // reference manual says it acts as a full fence (waiting until all earlier
88
- // instructions have completed, and delaying later instructions until it
89
- // completes). AMD assigns the same behavior to MFENCE.
90
- //
91
- // We need a fence before the initial RDTSC to prevent earlier instructions
92
- // from leaking into the region, and arguably another after RDTSC to avoid
93
- // region instructions from completing before the timestamp is recorded.
94
- // When surrounded by fences, the additional RDTSCP half-fence provides no
95
- // benefit, so the initial timestamp can be recorded via RDTSC, which has
96
- // lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
97
- // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
98
- //
99
- // Using Start+Start leads to higher variance and overhead than Stop+Stop.
100
- // However, Stop+Stop includes an LFENCE in the region measurements, which
101
- // adds a delay dependent on earlier loads. The combination of Start+Stop
102
- // is faster than Start+Start and more consistent than Stop+Stop because
103
- // the first LFENCE already delayed subsequent loads before the measured
104
- // region. This combination seems not to have been considered in prior work:
105
- // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
106
- //
107
- // Note: performance counters can measure 'exact' instructions-retired or
108
- // (unhalted) cycle counts. The RDPMC instruction is not serializing and also
109
- // requires fences. Unfortunately, it is not accessible on all OSes and we
110
- // prefer to avoid kernel-mode drivers. Performance counters are also affected
111
- // by several under/over-count errata, so we use the TSC instead.
112
-
113
- // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
114
- // divide by InvariantTicksPerSecond.
115
- inline Ticks Start() {
116
- Ticks t;
117
- #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
118
- asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
119
- #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
120
- // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
121
- asm volatile("mrs %0, cntvct_el0" : "=r"(t));
122
- #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
123
- _ReadWriteBarrier();
124
- _mm_lfence();
125
- _ReadWriteBarrier();
126
- t = __rdtsc();
127
- _ReadWriteBarrier();
128
- _mm_lfence();
129
- _ReadWriteBarrier();
130
- #elif HWY_ARCH_X86_64
131
- asm volatile(
132
- "lfence\n\t"
133
- "rdtsc\n\t"
134
- "shl $32, %%rdx\n\t"
135
- "or %%rdx, %0\n\t"
136
- "lfence"
137
- : "=a"(t)
138
- :
139
- // "memory" avoids reordering. rdx = TSC >> 32.
140
- // "cc" = flags modified by SHL.
141
- : "rdx", "memory", "cc");
142
- #elif HWY_ARCH_RISCV
143
- asm volatile("fence; rdtime %0" : "=r"(t));
144
- #elif defined(_WIN32) || defined(_WIN64)
145
- LARGE_INTEGER counter;
146
- (void)QueryPerformanceCounter(&counter);
147
- t = counter.QuadPart;
148
- #elif defined(__APPLE__)
149
- t = mach_absolute_time();
150
- #elif defined(__HAIKU__)
151
- t = system_time_nsecs(); // since boot
152
- #else // POSIX
153
- timespec ts;
154
- clock_gettime(CLOCK_MONOTONIC, &ts);
155
- t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
156
- #endif
157
- return t;
158
- }
159
-
160
- // WARNING: on x86, caller must check HasRDTSCP before using this!
161
- inline Ticks Stop() {
162
- uint64_t t;
163
- #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
164
- asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
165
- #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
166
- // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
167
- asm volatile("mrs %0, cntvct_el0" : "=r"(t));
168
- #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
169
- _ReadWriteBarrier();
170
- unsigned aux;
171
- t = __rdtscp(&aux);
172
- _ReadWriteBarrier();
173
- _mm_lfence();
174
- _ReadWriteBarrier();
175
- #elif HWY_ARCH_X86_64
176
- // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
177
- asm volatile(
178
- "rdtscp\n\t"
179
- "shl $32, %%rdx\n\t"
180
- "or %%rdx, %0\n\t"
181
- "lfence"
182
- : "=a"(t)
183
- :
184
- // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
185
- // "cc" = flags modified by SHL.
186
- : "rcx", "rdx", "memory", "cc");
187
- #else
188
- t = Start();
189
- #endif
190
- return t;
191
- }
38
+ inline Ticks Start() { return hwy::timer::Start(); }
39
+ inline Ticks Stop() { return hwy::timer::Stop(); }
192
40
 
193
41
  } // namespace timer
194
42
 
@@ -17,11 +17,39 @@
17
17
  #define HIGHWAY_HWY_TIMER_H_
18
18
 
19
19
  // Platform-specific timer functions. Provides Now() and functions for
20
- // interpreting and converting the timer-inl.h Ticks.
20
+ // interpreting and converting Ticks.
21
21
 
22
22
  #include <stdint.h>
23
+ #include <time.h> // clock_gettime
23
24
 
24
- #include "hwy/highway_export.h"
25
+ #include "hwy/base.h"
26
+
27
+ #if defined(_WIN32) || defined(_WIN64)
28
+ #ifndef NOMINMAX
29
+ #define NOMINMAX
30
+ #endif // NOMINMAX
31
+ #ifndef WIN32_LEAN_AND_MEAN
32
+ #define WIN32_LEAN_AND_MEAN
33
+ #endif // WIN32_LEAN_AND_MEAN
34
+ #include <windows.h>
35
+ #endif
36
+
37
+ #if defined(__APPLE__)
38
+ #include <mach/mach.h>
39
+ #include <mach/mach_time.h>
40
+ #endif
41
+
42
+ #if defined(__HAIKU__)
43
+ #include <OS.h>
44
+ #endif
45
+
46
+ #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
47
+ #include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
48
+ #endif
49
+
50
+ #if HWY_ARCH_X86 && HWY_COMPILER_MSVC
51
+ #include <intrin.h>
52
+ #endif
25
53
 
26
54
  namespace hwy {
27
55
  namespace platform {
@@ -32,7 +60,7 @@ namespace platform {
32
60
  // Uses InvariantTicksPerSecond and the baseline version of timer::Start().
33
61
  HWY_DLLEXPORT double Now();
34
62
 
35
- // Functions for use with timer-inl.h:
63
+ // Functions related to `Ticks` below.
36
64
 
37
65
  // Returns whether it is safe to call timer::Stop without executing an illegal
38
66
  // instruction; if false, fills cpu100 (a pointer to a 100 character buffer)
@@ -65,6 +93,145 @@ static inline double SecondsSince(const Timestamp& t0) {
65
93
  return t1.t - t0.t;
66
94
  }
67
95
 
96
+ // Low-level Start/Stop functions, previously in timer-inl.h.
97
+
98
+ namespace timer {
99
+
100
+ // Ticks := platform-specific timer values (CPU cycles on x86). Must be
101
+ // unsigned to guarantee wraparound on overflow.
102
+ using Ticks = uint64_t;
103
+
104
+ // Start/Stop return absolute timestamps and must be placed immediately before
105
+ // and after the region to measure. We provide separate Start/Stop functions
106
+ // because they use different fences.
107
+ //
108
+ // Background: RDTSC is not 'serializing'; earlier instructions may complete
109
+ // after it, and/or later instructions may complete before it. 'Fences' ensure
110
+ // regions' elapsed times are independent of such reordering. The only
111
+ // documented unprivileged serializing instruction is CPUID, which acts as a
112
+ // full fence (no reordering across it in either direction). Unfortunately
113
+ // the latency of CPUID varies wildly (perhaps made worse by not initializing
114
+ // its EAX input). Because it cannot reliably be deducted from the region's
115
+ // elapsed time, it must not be included in the region to measure (i.e.
116
+ // between the two RDTSC).
117
+ //
118
+ // The newer RDTSCP is sometimes described as serializing, but it actually
119
+ // only serves as a half-fence with release semantics. Although all
120
+ // instructions in the region will complete before the final timestamp is
121
+ // captured, subsequent instructions may leak into the region and increase the
122
+ // elapsed time. Inserting another fence after the final `RDTSCP` would prevent
123
+ // such reordering without affecting the measured region.
124
+ //
125
+ // Fortunately, such a fence exists. The LFENCE instruction is only documented
126
+ // to delay later loads until earlier loads are visible. However, Intel's
127
+ // reference manual says it acts as a full fence (waiting until all earlier
128
+ // instructions have completed, and delaying later instructions until it
129
+ // completes). AMD assigns the same behavior to MFENCE.
130
+ //
131
+ // We need a fence before the initial RDTSC to prevent earlier instructions
132
+ // from leaking into the region, and arguably another after RDTSC to avoid
133
+ // region instructions from completing before the timestamp is recorded.
134
+ // When surrounded by fences, the additional `RDTSCP` half-fence provides no
135
+ // benefit, so the initial timestamp can be recorded via RDTSC, which has
136
+ // lower overhead than `RDTSCP` because it does not read TSC_AUX. In summary,
137
+ // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
138
+ //
139
+ // Using Start+Start leads to higher variance and overhead than Stop+Stop.
140
+ // However, Stop+Stop includes an LFENCE in the region measurements, which
141
+ // adds a delay dependent on earlier loads. The combination of Start+Stop
142
+ // is faster than Start+Start and more consistent than Stop+Stop because
143
+ // the first LFENCE already delayed subsequent loads before the measured
144
+ // region. This combination seems not to have been considered in prior work:
145
+ // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
146
+ //
147
+ // Note: performance counters can measure 'exact' instructions-retired or
148
+ // (unhalted) cycle counts. The RDPMC instruction is not serializing and also
149
+ // requires fences. Unfortunately, it is not accessible on all OSes and we
150
+ // prefer to avoid kernel-mode drivers. Performance counters are also affected
151
+ // by several under/over-count errata, so we use the TSC instead.
152
+
153
+ // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
154
+ // divide by InvariantTicksPerSecond.
155
+ static HWY_INLINE Ticks Start() {
156
+ Ticks t;
157
+ #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
158
+ asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
159
+ #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
160
+ // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
161
+ asm volatile("mrs %0, cntvct_el0" : "=r"(t));
162
+ #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
163
+ _ReadWriteBarrier();
164
+ _mm_lfence();
165
+ _ReadWriteBarrier();
166
+ t = __rdtsc();
167
+ _ReadWriteBarrier();
168
+ _mm_lfence();
169
+ _ReadWriteBarrier();
170
+ #elif HWY_ARCH_X86_64
171
+ asm volatile(
172
+ "lfence\n\t"
173
+ "rdtsc\n\t"
174
+ "shl $32, %%rdx\n\t"
175
+ "or %%rdx, %0\n\t"
176
+ "lfence"
177
+ : "=a"(t)
178
+ :
179
+ // "memory" avoids reordering. rdx = TSC >> 32.
180
+ // "cc" = flags modified by SHL.
181
+ : "rdx", "memory", "cc");
182
+ #elif HWY_ARCH_RISCV
183
+ asm volatile("fence; rdtime %0" : "=r"(t));
184
+ #elif defined(_WIN32) || defined(_WIN64)
185
+ LARGE_INTEGER counter;
186
+ (void)QueryPerformanceCounter(&counter);
187
+ t = counter.QuadPart;
188
+ #elif defined(__APPLE__)
189
+ t = mach_absolute_time();
190
+ #elif defined(__HAIKU__)
191
+ t = system_time_nsecs(); // since boot
192
+ #else // POSIX
193
+ timespec ts;
194
+ clock_gettime(CLOCK_MONOTONIC, &ts);
195
+ t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
196
+ #endif
197
+ return t;
198
+ }
199
+
200
+ // WARNING: on x86, caller must check `HaveTimerStop()` before using this!
201
+ static HWY_INLINE Ticks Stop() {
202
+ uint64_t t;
203
+ #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
204
+ asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
205
+ #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
206
+ // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
207
+ asm volatile("mrs %0, cntvct_el0" : "=r"(t));
208
+ #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
209
+ _ReadWriteBarrier();
210
+ unsigned aux;
211
+ t = __rdtscp(&aux);
212
+ _ReadWriteBarrier();
213
+ _mm_lfence();
214
+ _ReadWriteBarrier();
215
+ #elif HWY_ARCH_X86_64
216
+ // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
217
+ asm volatile(
218
+ "rdtscp\n\t"
219
+ "shl $32, %%rdx\n\t"
220
+ "or %%rdx, %0\n\t"
221
+ "lfence"
222
+ : "=a"(t)
223
+ :
224
+ // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
225
+ // "cc" = flags modified by SHL.
226
+ : "rcx", "rdx", "memory", "cc");
227
+ #else
228
+ t = Start();
229
+ #endif
230
+ return t;
231
+ }
232
+
233
+ } // namespace timer
234
+
68
235
  } // namespace hwy
69
236
 
70
237
  #endif // HIGHWAY_HWY_TIMER_H_
@@ -0,0 +1,81 @@
1
+ // Copyright 2025 Google LLC
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+
16
+ #ifndef HIGHWAY_HWY_X86_CPUID_H_
17
+ #define HIGHWAY_HWY_X86_CPUID_H_
18
+
19
+ // Wrapper for x86 CPUID intrinsics. Empty on other platforms.
20
+
21
+ #include <stdint.h>
22
+
23
+ #include "hwy/base.h"
24
+
25
+ #if HWY_ARCH_X86
26
+
27
+ #if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
28
+ #include <intrin.h>
29
+ #else
30
+ #include <cpuid.h>
31
+ #endif
32
+
33
+ namespace hwy {
34
+ namespace x86 {
35
+
36
+ // Calls CPUID instruction with eax=level and ecx=count and returns the result
37
+ // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
38
+ static inline void Cpuid(const uint32_t level, const uint32_t count,
39
+ uint32_t* HWY_RESTRICT abcd) {
40
+ #if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
41
+ int regs[4];
42
+ __cpuidex(regs, static_cast<int>(level), static_cast<int>(count));
43
+ for (int i = 0; i < 4; ++i) {
44
+ abcd[i] = static_cast<uint32_t>(regs[i]);
45
+ }
46
+ #else // HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
47
+ uint32_t a;
48
+ uint32_t b;
49
+ uint32_t c;
50
+ uint32_t d;
51
+ __cpuid_count(level, count, a, b, c, d);
52
+ abcd[0] = a;
53
+ abcd[1] = b;
54
+ abcd[2] = c;
55
+ abcd[3] = d;
56
+ #endif // HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
57
+ }
58
+
59
+ static inline bool IsBitSet(const uint32_t reg, const int index) {
60
+ return (reg & (1U << index)) != 0;
61
+ }
62
+
63
+ static inline uint32_t MaxLevel() {
64
+ uint32_t abcd[4];
65
+ Cpuid(0, 0, abcd);
66
+ return abcd[0];
67
+ }
68
+
69
+ static inline bool IsAMD() {
70
+ uint32_t abcd[4];
71
+ Cpuid(0, 0, abcd);
72
+ const uint32_t max_level = abcd[0];
73
+ return max_level >= 1 && abcd[1] == 0x68747541 && abcd[2] == 0x444d4163 &&
74
+ abcd[3] == 0x69746e65;
75
+ }
76
+
77
+ } // namespace x86
78
+ } // namespace hwy
79
+
80
+ #endif // HWY_ARCH_X86
81
+ #endif // HIGHWAY_HWY_X86_CPUID_H_
@@ -345,9 +345,15 @@ namespace heif {
345
345
 
346
346
  bool has_channel(enum heif_channel channel) const noexcept;
347
347
 
348
- const uint8_t* get_plane(enum heif_channel channel, size_t* out_stride) const noexcept;
348
+ // DEPRECATED
349
+ const uint8_t* get_plane(enum heif_channel channel, int* out_stride) const noexcept;
349
350
 
350
- uint8_t* get_plane(enum heif_channel channel, size_t* out_stride) noexcept;
351
+ // DEPRECATED
352
+ uint8_t* get_plane(enum heif_channel channel, int* out_stride) noexcept;
353
+
354
+ const uint8_t* get_plane2(enum heif_channel channel, size_t* out_stride) const noexcept;
355
+
356
+ uint8_t* get_plane2(enum heif_channel channel, size_t* out_stride) noexcept;
351
357
 
352
358
  // throws Error
353
359
  void set_nclx_color_profile(const ColorProfile_nclx&);
@@ -552,7 +558,11 @@ namespace heif {
552
558
  heif_reader_trampoline_get_position,
553
559
  heif_reader_trampoline_read,
554
560
  heif_reader_trampoline_seek,
555
- heif_reader_trampoline_wait_for_file_size
561
+ heif_reader_trampoline_wait_for_file_size,
562
+ NULL,
563
+ NULL,
564
+ NULL,
565
+ NULL,
556
566
  };
557
567
 
558
568
  inline void Context::read_from_reader(Reader& reader, const ReadingOptions& /*opts*/)
@@ -906,12 +916,22 @@ namespace heif {
906
916
  return heif_image_has_channel(m_image.get(), channel);
907
917
  }
908
918
 
909
- inline const uint8_t* Image::get_plane(enum heif_channel channel, size_t* out_stride) const noexcept
919
+ inline const uint8_t* Image::get_plane(enum heif_channel channel, int* out_stride) const noexcept
920
+ {
921
+ return heif_image_get_plane_readonly(m_image.get(), channel, out_stride);
922
+ }
923
+
924
+ inline uint8_t* Image::get_plane(enum heif_channel channel, int* out_stride) noexcept
925
+ {
926
+ return heif_image_get_plane(m_image.get(), channel, out_stride);
927
+ }
928
+
929
+ inline const uint8_t* Image::get_plane2(enum heif_channel channel, size_t* out_stride) const noexcept
910
930
  {
911
931
  return heif_image_get_plane_readonly2(m_image.get(), channel, out_stride);
912
932
  }
913
933
 
914
- inline uint8_t* Image::get_plane(enum heif_channel channel, size_t* out_stride) noexcept
934
+ inline uint8_t* Image::get_plane2(enum heif_channel channel, size_t* out_stride) noexcept
915
935
  {
916
936
  return heif_image_get_plane2(m_image.get(), channel, out_stride);
917
937
  }