@img/sharp-libvips-dev 1.0.1 → 1.0.2

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (85)
  1. package/include/expat.h +21 -10
  2. package/include/expat_config.h +11 -5
  3. package/include/ffi.h +12 -25
  4. package/include/freetype2/freetype/config/ftoption.h +1 -1
  5. package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
  6. package/include/glib-2.0/gio/gapplication.h +6 -0
  7. package/include/glib-2.0/gio/giotypes.h +0 -1
  8. package/include/glib-2.0/girepository/giarginfo.h +23 -6
  9. package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
  10. package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
  11. package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
  12. package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
  13. package/include/glib-2.0/girepository/gienuminfo.h +20 -21
  14. package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
  15. package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
  16. package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
  17. package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
  18. package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
  19. package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
  20. package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
  21. package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
  22. package/include/glib-2.0/girepository/girepository.h +53 -62
  23. package/include/glib-2.0/girepository/girffi.h +8 -7
  24. package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
  25. package/include/glib-2.0/girepository/gistructinfo.h +26 -11
  26. package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
  27. package/include/glib-2.0/girepository/gitypelib.h +9 -13
  28. package/include/glib-2.0/girepository/gitypes.h +52 -104
  29. package/include/glib-2.0/girepository/giunioninfo.h +28 -12
  30. package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
  31. package/include/glib-2.0/girepository/givalueinfo.h +65 -0
  32. package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
  33. package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
  34. package/include/glib-2.0/glib/gbitlock.h +31 -0
  35. package/include/glib-2.0/glib/gmessages.h +8 -0
  36. package/include/glib-2.0/glib/gslice.h +2 -0
  37. package/include/glib-2.0/glib/gstrfuncs.h +24 -18
  38. package/include/glib-2.0/glib/gthread.h +191 -3
  39. package/include/glib-2.0/glib-unix.h +7 -1
  40. package/include/glib-2.0/gobject/genums.h +6 -6
  41. package/include/glib-2.0/gobject/glib-types.h +11 -0
  42. package/include/glib-2.0/gobject/gsignal.h +16 -6
  43. package/include/hwy/aligned_allocator.h +171 -6
  44. package/include/hwy/base.h +1765 -543
  45. package/include/hwy/cache_control.h +24 -6
  46. package/include/hwy/detect_compiler_arch.h +23 -2
  47. package/include/hwy/detect_targets.h +56 -13
  48. package/include/hwy/foreach_target.h +24 -0
  49. package/include/hwy/highway.h +20 -3
  50. package/include/hwy/ops/arm_neon-inl.h +1086 -667
  51. package/include/hwy/ops/arm_sve-inl.h +1091 -235
  52. package/include/hwy/ops/emu128-inl.h +271 -196
  53. package/include/hwy/ops/generic_ops-inl.h +2270 -399
  54. package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
  55. package/include/hwy/ops/rvv-inl.h +1043 -311
  56. package/include/hwy/ops/scalar-inl.h +189 -159
  57. package/include/hwy/ops/set_macros-inl.h +66 -6
  58. package/include/hwy/ops/shared-inl.h +175 -56
  59. package/include/hwy/ops/wasm_128-inl.h +153 -136
  60. package/include/hwy/ops/x86_128-inl.h +1647 -646
  61. package/include/hwy/ops/x86_256-inl.h +1003 -370
  62. package/include/hwy/ops/x86_512-inl.h +948 -353
  63. package/include/hwy/per_target.h +4 -0
  64. package/include/hwy/profiler.h +648 -0
  65. package/include/hwy/robust_statistics.h +2 -2
  66. package/include/hwy/targets.h +18 -11
  67. package/include/hwy/timer.h +11 -0
  68. package/include/libpng16/png.h +32 -29
  69. package/include/libpng16/pngconf.h +2 -2
  70. package/include/libpng16/pnglibconf.h +7 -2
  71. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  72. package/include/libxml2/libxml/parser.h +16 -7
  73. package/include/libxml2/libxml/xmlIO.h +0 -1
  74. package/include/libxml2/libxml/xmlversion.h +4 -4
  75. package/include/pango-1.0/pango/pango-features.h +3 -3
  76. package/include/pango-1.0/pango/pango-fontmap.h +7 -0
  77. package/include/pixman-1/pixman-version.h +2 -2
  78. package/include/png.h +32 -29
  79. package/include/pngconf.h +2 -2
  80. package/include/pnglibconf.h +7 -2
  81. package/include/vips/connection.h +9 -3
  82. package/include/vips/util.h +0 -9
  83. package/include/vips/version.h +4 -4
  84. package/package.json +1 -1
  85. package/versions.json +11 -11
package/include/hwy/per_target.h
@@ -17,6 +17,7 @@
  #define HIGHWAY_HWY_PER_TARGET_H_

  #include <stddef.h>
+ #include <stdint.h>

  #include "hwy/highway_export.h"

@@ -25,6 +26,9 @@

  namespace hwy {

+ // Returns the HWY_TARGET which HWY_DYNAMIC_DISPATCH selected.
+ HWY_DLLEXPORT int64_t DispatchedTarget();
+
  // Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
  //
  // Do not cache the result, which may change after calling DisableTargets, or
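The per_target.h hunk adds `hwy::DispatchedTarget()`. A minimal sketch of how calling code might use it, assuming `hwy::TargetName()` (declared in `hwy/targets.h`) is available to map the returned target bit to a printable name:

```cpp
#include <stdint.h>
#include <stdio.h>

#include "hwy/per_target.h"
#include "hwy/targets.h"  // hwy::TargetName (assumed available)

// Prints which SIMD target HWY_DYNAMIC_DISPATCH selected. Per the new
// header comment, this is only meaningful after a dynamically dispatched
// function has actually been invoked.
void PrintDispatchedTarget() {
  const int64_t target = hwy::DispatchedTarget();
  printf("Dispatched target: %s\n", hwy::TargetName(target));
}
```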
package/include/hwy/profiler.h
@@ -0,0 +1,648 @@
+ // Copyright 2017 Google Inc. All Rights Reserved.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ //     http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
+ #ifndef HIGHWAY_HWY_PROFILER_H_
+ #define HIGHWAY_HWY_PROFILER_H_
+
+ // High precision, low overhead time measurements. Returns exact call counts and
+ // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
+ //
+ // Uses RAII to capture begin/end timestamps, with user-specified zone names:
+ //   { PROFILER_ZONE("name"); /*code*/ } or
+ // the name of the current function:
+ //   void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+ //
+ // After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
+ // print call counts and average durations [CPU cycles] to stdout, sorted in
+ // descending order of total duration.
+ //
+ // The binary MUST be built with --dynamic_mode=off because we rely on the data
+ // segments being nearby; if not, an assertion will likely fail.
+
+ #include "hwy/base.h"
+
+ // Configuration settings:
+
+ // If zero, this file has no effect and no measurements will be recorded.
+ #ifndef PROFILER_ENABLED
+ #define PROFILER_ENABLED 0
+ #endif
+
+ // How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
+ // enters at least one zone. Once this buffer is full, the thread will analyze
+ // and discard packets, thus temporarily adding some observer overhead.
+ // Each zone occupies 16 bytes.
+ #ifndef PROFILER_THREAD_STORAGE
+ #define PROFILER_THREAD_STORAGE 200ULL
+ #endif
+
+ #if PROFILER_ENABLED || HWY_IDE
+
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <string.h>  // strcmp
+
+ #include <algorithm>  // std::sort
+ #include <atomic>
+
+ #include "hwy/aligned_allocator.h"
+ #include "hwy/cache_control.h"  // FlushStream
+ // #include "hwy/contrib/sort/vqsort.h"
+ #include "hwy/highway.h"  // Stream
+ #include "hwy/robust_statistics.h"
+ #include "hwy/timer-inl.h"
+ #include "hwy/timer.h"
+
+ #define PROFILER_PRINT_OVERHEAD 0
+
+ namespace hwy {
+
+ // Upper bounds for fixed-size data structures (guarded via HWY_DASSERT):
+
+ // How many threads can actually enter a zone (those that don't do not count).
+ // Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
+ // WARNING: a fiber library can spawn hundreds of threads.
+ static constexpr size_t kMaxThreads = 256;
+
+ static constexpr size_t kMaxDepth = 64;  // Maximum nesting of zones.
+
+ static constexpr size_t kMaxZones = 256;  // Total number of zones.
+
+ // Overwrites "to" without loading it into the cache (read-for-ownership).
+ // Both pointers must be aligned.
+ HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
+                                      uint64_t* HWY_RESTRICT to) {
+   namespace hn = HWY_NAMESPACE;
+   const hn::ScalableTag<uint64_t> d;
+   for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += Lanes(d)) {
+     hn::Stream(hn::Load(d, from + i), d, to + i);
+   }
+ }
+
+ #pragma pack(push, 1)
+
+ // Represents zone entry/exit events. Stores a full-resolution timestamp plus
+ // an offset (representing zone name or identifying exit packets). POD.
+ class Packet {
+  public:
+   // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
+   // (governed by kMaxZones). We have seen multi-megabyte offsets.
+   static constexpr size_t kOffsetBits = 25;
+   static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);
+
+   // We need full-resolution timestamps; at an effective rate of 4 GHz,
+   // this permits 1 minute zone durations (for longer durations, split into
+   // multiple zones). Wraparound is handled by masking.
+   static constexpr size_t kTimestampBits = 64 - kOffsetBits;
+   static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
+
+   static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
+     HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));
+
+     Packet packet;
+     packet.bits_ =
+         (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
+     return packet;
+   }
+
+   uint64_t Timestamp() const { return bits_ & kTimestampMask; }
+
+   size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }
+
+  private:
+   uint64_t bits_;
+ };
+ static_assert(sizeof(Packet) == 8, "Wrong Packet size");
+
+ // Returns the address of a string literal. Assuming zone names are also
+ // literals and stored nearby, we can represent them as offsets, which are
+ // faster to compute than hashes or even a static index.
+ //
+ // This function must not be static - each call (even from other translation
+ // units) must return the same value.
+ inline const char* StringOrigin() {
+   // Chosen such that no zone name is a prefix nor suffix of this string
+   // to ensure they aren't merged (offset 0 identifies zone-exit packets).
+   static const char* string_origin = "__#__";
+   return string_origin - Packet::kOffsetBias;
+ }
+
+ // Representation of an active zone, stored in a stack. Used to deduct
+ // child duration from the parent's self time. POD.
+ struct Node {
+   Packet packet;
+   uint64_t child_total;
+ };
+ static_assert(sizeof(Node) == 16, "Wrong Node size");
+
+ // Holds statistics for all zones with the same name. POD.
+ struct Accumulator {
+   static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;
+
+   uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
+   uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
+   uint64_t Duration() const { return u128.hi; }
+
+   void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
+     u128.hi = duration;
+     u128.lo = (biased_offset << kNumCallBits) + num_calls;
+   }
+
+   void Add(uint64_t num_calls, uint64_t duration) {
+     u128.lo += num_calls;
+     u128.hi += duration;
+   }
+
+   // For fast sorting by duration, which must therefore be the hi element.
+   // lo holds BiasedOffset and NumCalls.
+   uint128_t u128;
+ };
+ static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");
+
+ template <typename T>
+ inline T ClampedSubtract(const T minuend, const T subtrahend) {
+   if (subtrahend > minuend) {
+     return 0;
+   }
+   return minuend - subtrahend;
+ }
+
+ // Per-thread call graph (stack) and Accumulator for each zone.
+ class Results {
+  public:
+   Results() { ZeroBytes(zones_, sizeof(zones_)); }
+
+   // Used for computing overhead when this thread encounters its first Zone.
+   // This has no observable effect apart from increasing "analyze_elapsed_".
+   uint64_t ZoneDuration(const Packet* packets) {
+     HWY_DASSERT(depth_ == 0);
+     HWY_DASSERT(num_zones_ == 0);
+     AnalyzePackets(packets, 2);
+     const uint64_t duration = zones_[0].Duration();
+     zones_[0].Set(0, 0, 0);
+     HWY_DASSERT(depth_ == 0);
+     num_zones_ = 0;
+     return duration;
+   }
+
+   void SetSelfOverhead(const uint64_t self_overhead) {
+     self_overhead_ = self_overhead;
+   }
+
+   void SetChildOverhead(const uint64_t child_overhead) {
+     child_overhead_ = child_overhead;
+   }
+
+   // Draw all required information from the packets, which can be discarded
+   // afterwards. Called whenever this thread's storage is full.
+   void AnalyzePackets(const Packet* packets, const size_t num_packets) {
+     namespace hn = HWY_NAMESPACE;
+     const uint64_t t0 = hn::timer::Start();
+
+     for (size_t i = 0; i < num_packets; ++i) {
+       const Packet p = packets[i];
+       // Entering a zone
+       if (p.BiasedOffset() != Packet::kOffsetBias) {
+         HWY_DASSERT(depth_ < kMaxDepth);
+         nodes_[depth_].packet = p;
+         nodes_[depth_].child_total = 0;
+         ++depth_;
+         continue;
+       }
+
+       HWY_DASSERT(depth_ != 0);
+       const Node& node = nodes_[depth_ - 1];
+       // Masking correctly handles unsigned wraparound.
+       const uint64_t duration =
+           (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
+       const uint64_t self_duration = ClampedSubtract(
+           duration, self_overhead_ + child_overhead_ + node.child_total);
+
+       UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
+       --depth_;
+
+       // Deduct this nested node's time from its parent's self_duration.
+       if (depth_ != 0) {
+         nodes_[depth_ - 1].child_total += duration + child_overhead_;
+       }
+     }
+
+     const uint64_t t1 = hn::timer::Stop();
+     analyze_elapsed_ += t1 - t0;
+   }
+
+   // Incorporates results from another thread. Call after all threads have
+   // exited any zones.
+   void Assimilate(const Results& other) {
+     namespace hn = HWY_NAMESPACE;
+     const uint64_t t0 = hn::timer::Start();
+     HWY_DASSERT(depth_ == 0);
+     HWY_DASSERT(other.depth_ == 0);
+
+     for (size_t i = 0; i < other.num_zones_; ++i) {
+       const Accumulator& zone = other.zones_[i];
+       UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
+     }
+     const uint64_t t1 = hn::timer::Stop();
+     analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+   }
+
+   // Single-threaded.
+   void Print() {
+     namespace hn = HWY_NAMESPACE;
+     const uint64_t t0 = hn::timer::Start();
+     MergeDuplicates();
+
+     // Sort by decreasing total (self) cost.
+     // VQSort(&zones_[0].u128, num_zones_, SortDescending());
+     std::sort(zones_, zones_ + num_zones_,
+               [](const Accumulator& r1, const Accumulator& r2) {
+                 return r1.Duration() > r2.Duration();
+               });
+
+     const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();
+
+     const char* string_origin = StringOrigin();
+     for (size_t i = 0; i < num_zones_; ++i) {
+       const Accumulator& r = zones_[i];
+       const uint64_t num_calls = r.NumCalls();
+       printf("%-40s: %10zu x %15zu = %9.6f\n", string_origin + r.BiasedOffset(),
+              num_calls, r.Duration() / num_calls,
+              static_cast<double>(r.Duration()) * inv_freq);
+     }
+
+     const uint64_t t1 = hn::timer::Stop();
+     analyze_elapsed_ += t1 - t0;
+     printf("Total analysis [s]: %f\n",
+            static_cast<double>(analyze_elapsed_) * inv_freq);
+   }
+
+  private:
+   // Updates an existing Accumulator (uniquely identified by biased_offset) or
+   // adds one if this is the first time this thread analyzed that zone.
+   // Uses a self-organizing list data structure, which avoids dynamic memory
+   // allocations and is far faster than unordered_map. Loads, updates and
+   // stores the entire Accumulator with vector instructions.
+   void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
+                    const uint64_t duration) {
+     HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
+
+     // Special case for first zone: (maybe) update, without swapping.
+     if (zones_[0].BiasedOffset() == biased_offset) {
+       zones_[0].Add(num_calls, duration);
+       HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
+       return;
+     }
+
+     // Look for a zone with the same offset.
+     for (size_t i = 1; i < num_zones_; ++i) {
+       if (zones_[i].BiasedOffset() == biased_offset) {
+         zones_[i].Add(num_calls, duration);
+         HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
+         // Swap with predecessor (more conservative than move to front,
+         // but at least as successful).
+         const Accumulator prev = zones_[i - 1];
+         zones_[i - 1] = zones_[i];
+         zones_[i] = prev;
+         return;
+       }
+     }
+
+     // Not found; create a new Accumulator.
+     HWY_DASSERT(num_zones_ < kMaxZones);
+     Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
+     zone->Set(biased_offset, num_calls, duration);
+     HWY_DASSERT(zone->BiasedOffset() == biased_offset);
+     ++num_zones_;
+   }
+
+   // Each instantiation of a function template seems to get its own copy of
+   // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
+   // acceptable because we only expect a few dozen zones.
+   void MergeDuplicates() {
+     const char* string_origin = StringOrigin();
+     for (size_t i = 0; i < num_zones_; ++i) {
+       const size_t biased_offset = zones_[i].BiasedOffset();
+       const char* name = string_origin + biased_offset;
+       // Separate num_calls from biased_offset so we can add them together.
+       uint64_t num_calls = zones_[i].NumCalls();
+
+       // Add any subsequent duplicates to num_calls and total_duration.
+       for (size_t j = i + 1; j < num_zones_;) {
+         if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
+           num_calls += zones_[j].NumCalls();
+           zones_[i].Add(0, zones_[j].Duration());
+           // Fill hole with last item.
+           zones_[j] = zones_[--num_zones_];
+         } else {  // Name differed, try next Accumulator.
+           ++j;
+         }
+       }
+
+       HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));
+
+       // Re-pack regardless of whether any duplicates were found.
+       zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
+     }
+   }
+
+   uint64_t analyze_elapsed_ = 0;
+   uint64_t self_overhead_ = 0;
+   uint64_t child_overhead_ = 0;
+
+   size_t depth_ = 0;      // Number of active zones.
+   size_t num_zones_ = 0;  // Number of retired zones.
+
+   alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth];         // Stack
+   alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones];  // Self-organizing list
+ };
+
+ // Per-thread packet storage, dynamically allocated.
+ class ThreadSpecific {
+   static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);
+
+  public:
+   // "name" is used to sanity-check offsets fit in kOffsetBits.
+   explicit ThreadSpecific(const char* name)
+       : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
+         packets_(AllocateAligned<Packet>(max_packets_)),
+         num_packets_(0),
+         string_origin_(StringOrigin()) {
+     // Even in optimized builds, verify that this zone's name offset fits
+     // within the allotted space. If not, UpdateOrAdd is likely to overrun
+     // zones_[]. Checking here on the cold path (only reached once per thread)
+     // is cheap, but it only covers one zone.
+     const size_t biased_offset = name - string_origin_;
+     HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
+   }
+
+   // Depends on Zone => defined below.
+   void ComputeOverhead();
+
+   void WriteEntry(const char* name, const uint64_t timestamp) {
+     const size_t biased_offset = name - string_origin_;
+     Write(Packet::Make(biased_offset, timestamp));
+   }
+
+   void WriteExit(const uint64_t timestamp) {
+     const size_t biased_offset = Packet::kOffsetBias;
+     Write(Packet::Make(biased_offset, timestamp));
+   }
+
+   void AnalyzeRemainingPackets() {
+     // Ensures prior weakly-ordered streaming stores are globally visible.
+     FlushStream();
+
+     // Storage full => empty it.
+     if (num_packets_ + buffer_size_ > max_packets_) {
+       results_.AnalyzePackets(packets_.get(), num_packets_);
+       num_packets_ = 0;
+     }
+     CopyBytes(buffer_, packets_.get() + num_packets_,
+               buffer_size_ * sizeof(Packet));
+     num_packets_ += buffer_size_;
+
+     results_.AnalyzePackets(packets_.get(), num_packets_);
+     num_packets_ = 0;
+   }
+
+   Results& GetResults() { return results_; }
+
+  private:
+   // Write packet to buffer/storage, emptying them as needed.
+   void Write(const Packet packet) {
+     // Buffer full => copy to storage.
+     if (buffer_size_ == kBufferCapacity) {
+       // Storage full => empty it.
+       if (num_packets_ + kBufferCapacity > max_packets_) {
+         results_.AnalyzePackets(packets_.get(), num_packets_);
+         num_packets_ = 0;
+       }
+       // This buffering halves observer overhead and decreases the overall
+       // runtime by about 3%. Casting is safe because the first member is u64.
+       StreamCacheLine(
+           reinterpret_cast<const uint64_t*>(buffer_),
+           reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
+       num_packets_ += kBufferCapacity;
+       buffer_size_ = 0;
+     }
+     buffer_[buffer_size_] = packet;
+     ++buffer_size_;
+   }
+
+   // Write-combining buffer to avoid cache pollution. Must be the first
+   // non-static member to ensure cache-line alignment.
+   Packet buffer_[kBufferCapacity];
+   size_t buffer_size_ = 0;
+
+   const size_t max_packets_;
+   // Contiguous storage for zone enter/exit packets.
+   AlignedFreeUniquePtr<Packet[]> packets_;
+   size_t num_packets_;
+   // Cached here because we already read this cache line on zone entry/exit.
+   const char* HWY_RESTRICT string_origin_;
+   Results results_;
+ };
+
+ class ThreadList {
+  public:
+   // Called from any thread.
+   ThreadSpecific* Add(const char* name) {
+     const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
+     HWY_DASSERT(index < kMaxThreads);
+
+     ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
+     threads_[index].store(ts, std::memory_order_release);
+     return ts;
+   }
+
+   // Single-threaded.
+   void PrintResults() {
+     const auto acq = std::memory_order_acquire;
+     const size_t num_threads = num_threads_.load(acq);
+
+     ThreadSpecific* main = threads_[0].load(acq);
+     main->AnalyzeRemainingPackets();
+
+     for (size_t i = 1; i < num_threads; ++i) {
+       ThreadSpecific* ts = threads_[i].load(acq);
+       ts->AnalyzeRemainingPackets();
+       main->GetResults().Assimilate(ts->GetResults());
+     }
+
+     if (num_threads != 0) {
+       main->GetResults().Print();
+     }
+   }
+
+  private:
+   // Owning pointers.
+   alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
+   std::atomic<size_t> num_threads_{0};
+ };
+
+ // RAII zone enter/exit recorder constructed by the ZONE macro; also
+ // responsible for initializing ThreadSpecific.
+ class Zone {
+  public:
+   // "name" must be a string literal (see StringOrigin).
+   HWY_NOINLINE explicit Zone(const char* name) {
+     HWY_FENCE;
+     ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
+     if (HWY_UNLIKELY(thread_specific == nullptr)) {
+       // Ensure the CPU supports our timer.
+       char cpu[100];
+       if (!platform::HaveTimerStop(cpu)) {
+         HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
+       }
+
+       thread_specific = StaticThreadSpecific() = Threads().Add(name);
+       // Must happen after setting StaticThreadSpecific, because ComputeOverhead
+       // also calls Zone().
+       thread_specific->ComputeOverhead();
+     }
+
+     // (Capture timestamp ASAP, not inside WriteEntry.)
+     HWY_FENCE;
+     const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
+     thread_specific->WriteEntry(name, timestamp);
+   }
+
+   HWY_NOINLINE ~Zone() {
+     HWY_FENCE;
+     const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
+     StaticThreadSpecific()->WriteExit(timestamp);
+     HWY_FENCE;
+   }
+
+   // Call exactly once after all threads have exited all zones.
+   static void PrintResults() { Threads().PrintResults(); }
+
+  private:
+   // Returns reference to the thread's ThreadSpecific pointer (initially null).
+   // Function-local static avoids needing a separate definition.
+   static ThreadSpecific*& StaticThreadSpecific() {
+     static thread_local ThreadSpecific* thread_specific;
+     return thread_specific;
+   }
+
+   // Returns the singleton ThreadList. Non time-critical.
+   static ThreadList& Threads() {
+     static ThreadList threads_;
+     return threads_;
+   }
+ };
+
+ // Creates a zone starting from here until the end of the current scope.
+ // Timestamps will be recorded when entering and exiting the zone.
+ // "name" must be a string literal, which is ensured by merging with "".
+ #define PROFILER_ZONE(name)        \
+   HWY_FENCE;                       \
+   const hwy::Zone zone("" name);   \
+   HWY_FENCE
+
+ // Creates a zone for an entire function (when placed at its beginning).
+ // Shorter/more convenient than ZONE.
+ #define PROFILER_FUNC              \
+   HWY_FENCE;                       \
+   const hwy::Zone zone(__func__);  \
+   HWY_FENCE
+
+ #define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults
+
+ inline void ThreadSpecific::ComputeOverhead() {
+   namespace hn = HWY_NAMESPACE;
+   // Delay after capturing timestamps before/after the actual zone runs. Even
+   // with frequency throttling disabled, this has a multimodal distribution,
+   // including 32, 34, 48, 52, 59, 62.
+   uint64_t self_overhead;
+   {
+     const size_t kNumSamples = 32;
+     uint32_t samples[kNumSamples];
+     for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+       const size_t kNumDurations = 1024;
+       uint32_t durations[kNumDurations];
+
+       for (size_t idx_duration = 0; idx_duration < kNumDurations;
+            ++idx_duration) {
+         {
+           PROFILER_ZONE("Dummy Zone (never shown)");
+         }
+         const uint64_t duration = results_.ZoneDuration(buffer_);
+         buffer_size_ = 0;
+         durations[idx_duration] = static_cast<uint32_t>(duration);
+         HWY_DASSERT(num_packets_ == 0);
+       }
+       robust_statistics::CountingSort(durations, kNumDurations);
+       samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
+     }
+     // Median.
+     robust_statistics::CountingSort(samples, kNumSamples);
+     self_overhead = samples[kNumSamples / 2];
+     if (PROFILER_PRINT_OVERHEAD) {
+       printf("Overhead: %zu\n", self_overhead);
+     }
+     results_.SetSelfOverhead(self_overhead);
+   }
+
+   // Delay before capturing start timestamp / after end timestamp.
+   const size_t kNumSamples = 32;
+   uint32_t samples[kNumSamples];
+   for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+     const size_t kNumDurations = 16;
+     uint32_t durations[kNumDurations];
+     for (size_t idx_duration = 0; idx_duration < kNumDurations;
+          ++idx_duration) {
+       const size_t kReps = 10000;
+       // Analysis time should not be included => must fit within buffer.
+       HWY_DASSERT(kReps * 2 < max_packets_);
+       std::atomic_thread_fence(std::memory_order_seq_cst);
+       const uint64_t t0 = hn::timer::Start();
+       for (size_t i = 0; i < kReps; ++i) {
+         PROFILER_ZONE("Dummy");
+       }
+       FlushStream();
+       const uint64_t t1 = hn::timer::Stop();
+       HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
+       buffer_size_ = 0;
+       num_packets_ = 0;
+       const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
+       durations[idx_duration] =
+           static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
+     }
+     robust_statistics::CountingSort(durations, kNumDurations);
+     samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
+   }
+   robust_statistics::CountingSort(samples, kNumSamples);
+   const uint64_t child_overhead = samples[9 * kNumSamples / 10];
+   if (PROFILER_PRINT_OVERHEAD) {
+     printf("Child overhead: %zu\n", child_overhead);
+   }
+   results_.SetChildOverhead(child_overhead);
+ }
+
+ #pragma pack(pop)
+
+ }  // namespace hwy
+
+ #endif  // PROFILER_ENABLED || HWY_IDE
+
+ #if !PROFILER_ENABLED && !HWY_IDE
+ #define PROFILER_ZONE(name)
+ #define PROFILER_FUNC
+ #define PROFILER_PRINT_RESULTS()
+ #endif
+
+ #endif  // HIGHWAY_HWY_PROFILER_H_
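The new profiler.h documents its own usage in the header comment above: RAII macros open a zone for the rest of the current scope, and results are printed once all threads have exited their zones. A minimal sketch following those comments (the function and zone names are illustrative); build with `-DPROFILER_ENABLED=1` and, per the header, without dynamic linking of the data segments:

```cpp
#include "hwy/profiler.h"

void FuncToMeasure() {
  PROFILER_FUNC;  // zone named after the enclosing function (__func__)
  // ... code to measure ...
}

int main() {
  {
    PROFILER_ZONE("outer");  // name must be a string literal
    FuncToMeasure();
  }
  // After all threads have exited any zones: prints call counts and
  // average durations [CPU cycles] to stdout, sorted by total duration.
  PROFILER_PRINT_RESULTS();
  return 0;
}
```

With PROFILER_ENABLED left at 0, the macros expand to nothing, so the instrumentation can stay in place at zero cost.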
package/include/hwy/robust_statistics.h
@@ -135,8 +135,8 @@ T MedianAbsoluteDeviation(const T* values, const size_t num_values,
  std::vector<T> abs_deviations;
  abs_deviations.reserve(num_values);
  for (size_t i = 0; i < num_values; ++i) {
- const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
-                              static_cast<int64_t>(median));
+ const int64_t abs = ScalarAbs(static_cast<int64_t>(values[i]) -
+                               static_cast<int64_t>(median));
  abs_deviations.push_back(static_cast<T>(abs));
  }
  return Median(abs_deviations.data(), num_values);
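This hunk swaps `std::abs` for Highway's `ScalarAbs` inside `MedianAbsoluteDeviation`, which computes the median of the absolute deviations from the median (MAD). For illustration, a hedged sketch of that computation, assuming the truncated third parameter is the precomputed median (the body above references `median`, and `Median(T*, size_t)` matches the call in the last line):

```cpp
#include <stdint.h>
#include <stdio.h>

#include "hwy/robust_statistics.h"

int main() {
  // Median of {1, 2, 3, 4, 100} is 3; absolute deviations from it are
  // {2, 1, 0, 1, 97}, whose median -- the MAD -- is 1. The outlier barely
  // moves the result, which is why MAD suits noisy timer measurements.
  int64_t values[5] = {1, 2, 3, 4, 100};
  const int64_t median = hwy::robust_statistics::Median(values, 5);
  const int64_t mad =
      hwy::robust_statistics::MedianAbsoluteDeviation(values, 5, median);
  printf("median=%lld mad=%lld\n", static_cast<long long>(median),
         static_cast<long long>(mad));
  return 0;
}
```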