@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1

This diff shows the contents of publicly released package versions and is provided for informational purposes only; it reflects the changes between the two versions as they appear in their public registry.
Files changed (55)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-deprecated.h +4 -4
  3. package/include/harfbuzz/hb-font.h +120 -9
  4. package/include/harfbuzz/hb-version.h +3 -3
  5. package/include/hwy/abort.h +2 -19
  6. package/include/hwy/aligned_allocator.h +11 -7
  7. package/include/hwy/auto_tune.h +504 -0
  8. package/include/hwy/base.h +425 -104
  9. package/include/hwy/cache_control.h +16 -0
  10. package/include/hwy/detect_compiler_arch.h +32 -1
  11. package/include/hwy/detect_targets.h +251 -67
  12. package/include/hwy/foreach_target.h +35 -0
  13. package/include/hwy/highway.h +185 -76
  14. package/include/hwy/nanobenchmark.h +1 -19
  15. package/include/hwy/ops/arm_neon-inl.h +969 -458
  16. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  17. package/include/hwy/ops/emu128-inl.h +97 -11
  18. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  19. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  20. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  21. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  22. package/include/hwy/ops/rvv-inl.h +546 -51
  23. package/include/hwy/ops/scalar-inl.h +77 -22
  24. package/include/hwy/ops/set_macros-inl.h +138 -17
  25. package/include/hwy/ops/shared-inl.h +50 -10
  26. package/include/hwy/ops/wasm_128-inl.h +137 -92
  27. package/include/hwy/ops/x86_128-inl.h +773 -214
  28. package/include/hwy/ops/x86_256-inl.h +712 -255
  29. package/include/hwy/ops/x86_512-inl.h +429 -753
  30. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  31. package/include/hwy/per_target.h +2 -1
  32. package/include/hwy/profiler.h +622 -486
  33. package/include/hwy/targets.h +62 -20
  34. package/include/hwy/timer-inl.h +8 -160
  35. package/include/hwy/timer.h +170 -3
  36. package/include/hwy/x86_cpuid.h +81 -0
  37. package/include/libheif/heif_cxx.h +25 -5
  38. package/include/libheif/heif_regions.h +5 -5
  39. package/include/libheif/heif_version.h +2 -2
  40. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  41. package/include/libxml2/libxml/xmlversion.h +4 -4
  42. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  43. package/include/pango-1.0/pango/pango-features.h +3 -3
  44. package/include/pango-1.0/pango/pango-font.h +30 -0
  45. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  46. package/include/pixman-1/pixman-version.h +2 -2
  47. package/include/webp/decode.h +11 -2
  48. package/include/webp/demux.h +2 -0
  49. package/include/webp/encode.h +2 -0
  50. package/include/webp/mux_types.h +1 -0
  51. package/include/webp/sharpyuv/sharpyuv.h +1 -1
  52. package/include/webp/types.h +2 -2
  53. package/include/zlib.h +3 -3
  54. package/package.json +1 -1
  55. package/versions.json +11 -11
@@ -15,6 +15,11 @@
  #ifndef HIGHWAY_HWY_PROFILER_H_
  #define HIGHWAY_HWY_PROFILER_H_

+ #include <stddef.h>
+ #include <stdint.h>
+
+ #include "hwy/highway_export.h"
+
  // High precision, low overhead time measurements. Returns exact call counts and
  // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
  //
@@ -22,627 +27,758 @@
  // { PROFILER_ZONE("name"); /*code*/ } or
  // the name of the current function:
  // void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+ // You can reduce the overhead by passing a thread ID:
+ // `PROFILER_ZONE2(thread, name)`. The new and preferred API also allows
+ // passing flags, such as requesting inclusive time:
+ // `static const auto zone = profiler.AddZone("name", flags);` and then
+ // `PROFILER_ZONE3(profiler, thread, zone)`.
  //
- // After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
+ // After all threads exit all zones, call `Profiler::Get().PrintResults()` to
  // print call counts and average durations [CPU cycles] to stdout, sorted in
  // descending order of total duration.
- //
- // The binary MUST be built with --dynamic_mode=off because we rely on the data
- // segments being nearby; if not, an assertion will likely fail.

- #include "hwy/base.h"
-
- // Configuration settings:
-
- // If zero, this file has no effect and no measurements will be recorded.
+ // If zero, mock `Profiler` and `profiler::Zone` will be defined.
  #ifndef PROFILER_ENABLED
  #define PROFILER_ENABLED 0
  #endif

- // How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
- // enters at least one zone. Once this buffer is full, the thread will analyze
- // and discard packets, thus temporarily adding some observer overhead.
- // Each zone occupies 16 bytes.
- #ifndef PROFILER_THREAD_STORAGE
- #define PROFILER_THREAD_STORAGE 200ULL
- #endif
-
- #if PROFILER_ENABLED || HWY_IDE
-
- #include <stddef.h>
- #include <stdint.h>
+ #if PROFILER_ENABLED
  #include <stdio.h>
- #include <string.h> // strcmp
+ #include <string.h> // strcmp, strlen

  #include <algorithm> // std::sort
  #include <atomic>
+ #include <vector>

  #include "hwy/aligned_allocator.h"
- #include "hwy/cache_control.h" // FlushStream
- // #include "hwy/contrib/sort/vqsort.h"
- #include "hwy/highway.h" // Stream
- #include "hwy/robust_statistics.h"
- #include "hwy/timer-inl.h"
+ #include "hwy/base.h"
+ #include "hwy/bit_set.h"
  #include "hwy/timer.h"
-
- #define PROFILER_PRINT_OVERHEAD 0
+ #endif // PROFILER_ENABLED

  namespace hwy {

- // Upper bounds for fixed-size data structures (guarded via HWY_DASSERT):
+ // Flags: we want type-safety (enum class) to catch mistakes such as confusing
+ // zone with flags. Base type (`uint32_t`) ensures it is safe to cast. Defined
+ // outside the `#if` because callers pass them to `PROFILER_ZONE3`. Keep in
+ // sync with `kNumFlags` below.
+ enum class ProfilerFlags : uint32_t {
+ kDefault = 0,
+ // The zone should report cumulative time, including all child zones. If not
+ // specified, zones report self-time, excluding child zones.
+ kInclusive = 1
+ };

- // How many threads can actually enter a zone (those that don't do not count).
- // Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
- // WARNING: a fiber library can spawn hundreds of threads.
- static constexpr size_t kMaxThreads = 256;
+ #if PROFILER_ENABLED

- static constexpr size_t kMaxDepth = 64; // Maximum nesting of zones.
+ // Implementation details.
+ namespace profiler {

- static constexpr size_t kMaxZones = 256; // Total number of zones.
+ HWY_INLINE_VAR constexpr size_t kNumFlags = 1;

- // Overwrites "to" without loading it into the cache (read-for-ownership).
- // Both pointers must be aligned.
- HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
- uint64_t* HWY_RESTRICT to) {
- namespace hn = HWY_NAMESPACE;
- const hn::ScalableTag<uint64_t> d;
- for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += Lanes(d)) {
- hn::Stream(hn::Load(d, from + i), d, to + i);
- }
- }
+ // Upper bounds for fixed-size data structures, guarded via HWY_DASSERT:

- #pragma pack(push, 1)
+ // Maximum nesting of zones, chosen such that PerThread is 256 bytes.
+ HWY_INLINE_VAR constexpr size_t kMaxDepth = 13;
+ // Reports with more than ~50 are anyway difficult to read.
+ HWY_INLINE_VAR constexpr size_t kMaxZones = 128;
+ // Upper bound on threads that call `InitThread`, and `thread` arguments. Note
+ // that fiber libraries can spawn hundreds of threads. Enough for Turin cores.
+ HWY_INLINE_VAR constexpr size_t kMaxThreads = 256;

- // Represents zone entry/exit events. Stores a full-resolution timestamp plus
- // an offset (representing zone name or identifying exit packets). POD.
- class Packet {
+ // Type-safe wrapper for zone index plus flags, returned by `AddZone`.
+ class ZoneHandle {
  public:
- // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
- // (governed by kMaxZones). We have seen multi-megabyte offsets.
- static constexpr size_t kOffsetBits = 25;
- static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);
+ ZoneHandle() : bits_(0) {} // for Accumulator member initialization
+
+ ZoneHandle(size_t zone_idx, ProfilerFlags flags) {
+ HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones);
+ const uint32_t flags_u = static_cast<uint32_t>(flags);
+ HWY_DASSERT(flags_u < (1u << kNumFlags));
+ bits_ = (static_cast<uint32_t>(zone_idx) << kNumFlags) | flags_u;
+ HWY_DASSERT(ZoneIdx() == zone_idx);
+ }

- // We need full-resolution timestamps; at an effective rate of 4 GHz,
- // this permits 1 minute zone durations (for longer durations, split into
- // multiple zones). Wraparound is handled by masking.
- static constexpr size_t kTimestampBits = 64 - kOffsetBits;
- static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
+ ZoneHandle(const ZoneHandle& other) = default;
+ ZoneHandle& operator=(const ZoneHandle& other) = default;

- static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
- HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));
+ bool operator==(const ZoneHandle other) const { return bits_ == other.bits_; }
+ bool operator!=(const ZoneHandle other) const { return bits_ != other.bits_; }

- Packet packet;
- packet.bits_ =
- (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
- return packet;
+ size_t ZoneIdx() const {
+ HWY_DASSERT(bits_ != 0);
+ const size_t zone_idx = bits_ >> kNumFlags;
+ HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones);
+ return zone_idx;
  }

- uint64_t Timestamp() const { return bits_ & kTimestampMask; }
+ bool IsInclusive() const {
+ HWY_DASSERT(bits_ != 0);
+ return (bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive)) != 0;
+ }

- size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }
+ // Returns a mask to zero/ignore child totals for inclusive zones.
+ uint64_t ChildTotalMask() const {
+ // Without this function, clang tends to generate a branch.
+ return IsInclusive() ? 0 : ~uint64_t{0};
+ }

  private:
- uint64_t bits_;
+ uint32_t bits_;
  };
- static_assert(sizeof(Packet) == 8, "Wrong Packet size");

- // Returns the address of a string literal. Assuming zone names are also
- // literals and stored nearby, we can represent them as offsets, which are
- // faster to compute than hashes or even a static index.
- //
- // This function must not be static - each call (even from other translation
- // units) must return the same value.
- inline const char* StringOrigin() {
- // Chosen such that no zone name is a prefix nor suffix of this string
- // to ensure they aren't merged (offset 0 identifies zone-exit packets).
- static const char* string_origin = "__#__";
- return string_origin - Packet::kOffsetBias;
- }
-
- // Representation of an active zone, stored in a stack. Used to deduct
- // child duration from the parent's self time. POD.
- struct Node {
- Packet packet;
- uint64_t child_total;
+ // Storage for zone names.
+ class Names {
+ static constexpr std::memory_order kRel = std::memory_order_relaxed;
+
+ public:
+ // Returns a copy of the `name` passed to `AddZone` that returned the
+ // given `zone`.
+ const char* Get(ZoneHandle zone) const { return ptrs_[zone.ZoneIdx()]; }
+
+ ZoneHandle AddZone(const char* name, ProfilerFlags flags) {
+ // Linear search whether it already exists.
+ const size_t num_zones = next_ptr_.load(kRel);
+ HWY_ASSERT(num_zones < kMaxZones);
+ for (size_t zone_idx = 1; zone_idx < num_zones; ++zone_idx) {
+ if (!strcmp(ptrs_[zone_idx], name)) {
+ return ZoneHandle(zone_idx, flags);
+ }
+ }
+
+ // Reserve the next `zone_idx` (index in `ptrs_`).
+ const size_t zone_idx = next_ptr_.fetch_add(1, kRel);
+
+ // Copy into `name` into `chars_`.
+ const size_t len = strlen(name) + 1;
+ const size_t pos = next_char_.fetch_add(len, kRel);
+ HWY_ASSERT(pos + len <= sizeof(chars_));
+ strcpy(chars_ + pos, name); // NOLINT
+
+ ptrs_[zone_idx] = chars_ + pos;
+ const ZoneHandle zone(zone_idx, flags);
+ HWY_DASSERT(!strcmp(Get(zone), name));
+ return zone;
+ }
+
+ private:
+ const char* ptrs_[kMaxZones];
+ std::atomic<size_t> next_ptr_{1}; // next zone_idx
+ char chars_[kMaxZones * 70];
+ std::atomic<size_t> next_char_{0};
  };
- static_assert(sizeof(Node) == 16, "Wrong Node size");

- // Holds statistics for all zones with the same name. POD.
+ // Holds total duration and number of calls. "Which thread entered it" is
+ // unnecessary because these are per-thread.
  struct Accumulator {
- static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;
+ void Add(ZoneHandle new_zone, uint64_t self_duration) {
+ duration += self_duration;

- uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
- uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
- uint64_t Duration() const { return u128.hi; }
+ // Only called for valid zones.
+ HWY_DASSERT(new_zone != ZoneHandle());
+ // Our zone might not have been set yet.
+ HWY_DASSERT(zone == ZoneHandle() || zone == new_zone);
+ zone = new_zone;

- void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
- u128.hi = duration;
- u128.lo = (biased_offset << kNumCallBits) + num_calls;
+ num_calls += 1;
  }

- void Add(uint64_t num_calls, uint64_t duration) {
- u128.lo += num_calls;
- u128.hi += duration;
+ void Assimilate(Accumulator& other) {
+ duration += other.duration;
+ other.duration = 0;
+
+ // `ZoneSet` ensures we only call this for non-empty `other`.
+ HWY_DASSERT(other.zone != ZoneHandle());
+ // Our zone might not have been set yet.
+ HWY_DASSERT(zone == ZoneHandle() || zone == other.zone);
+ zone = other.zone;
+
+ num_calls += other.num_calls;
+ other.num_calls = 0;
  }

- // For fast sorting by duration, which must therefore be the hi element.
- // lo holds BiasedOffset and NumCalls.
- uint128_t u128;
+ uint64_t duration = 0;
+ ZoneHandle zone; // flags are used by `Results::Print`
+ uint32_t num_calls = 0;
  };
  static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");

- template <typename T>
- inline T ClampedSubtract(const T minuend, const T subtrahend) {
- if (subtrahend > minuend) {
- return 0;
- }
- return minuend - subtrahend;
- }
-
- // Per-thread call graph (stack) and Accumulator for each zone.
- class Results {
+ // Modified from `hwy::BitSet4096`. Avoids the second-level `BitSet64`, because
+ // we only need `kMaxZones` = 128.
+ class ZoneSet {
  public:
- Results() { ZeroBytes(zones_, sizeof(zones_)); }
+ // No harm if `i` is already set.
+ void Set(size_t i) {
+ HWY_DASSERT(i < kMaxZones);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
+ bits_[idx].Set(mod);
+ HWY_DASSERT(Get(i));
+ }

- // Used for computing overhead when this thread encounters its first Zone.
- // This has no observable effect apart from increasing "analyze_elapsed_".
- uint64_t ZoneDuration(const Packet* packets) {
- HWY_DASSERT(depth_ == 0);
- HWY_DASSERT(num_zones_ == 0);
- AnalyzePackets(packets, 2);
- const uint64_t duration = zones_[0].Duration();
- zones_[0].Set(0, 0, 0);
- HWY_DASSERT(depth_ == 0);
- num_zones_ = 0;
- return duration;
+ void Clear(size_t i) {
+ HWY_DASSERT(i < kMaxZones);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
+ bits_[idx].Clear(mod);
+ HWY_DASSERT(!Get(i));
  }

- void SetSelfOverhead(const uint64_t self_overhead) {
- self_overhead_ = self_overhead;
+ bool Get(size_t i) const {
+ HWY_DASSERT(i < kMaxZones);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
+ return bits_[idx].Get(mod);
  }

- void SetChildOverhead(const uint64_t child_overhead) {
- child_overhead_ = child_overhead;
+ // Returns lowest i such that Get(i). Caller must ensure Any() beforehand!
+ size_t First() const {
+ HWY_DASSERT(bits_[0].Any() || bits_[1].Any());
+ const size_t idx = bits_[0].Any() ? 0 : 1;
+ return idx * 64 + bits_[idx].First();
  }

- // Draw all required information from the packets, which can be discarded
- // afterwards. Called whenever this thread's storage is full.
- void AnalyzePackets(const Packet* packets, const size_t num_packets) {
- namespace hn = HWY_NAMESPACE;
- const uint64_t t0 = hn::timer::Start();
+ // Calls `func(i)` for each `i` in the set. It is safe for `func` to modify
+ // the set, but the current Foreach call is only affected if changing one of
+ // the not yet visited BitSet64 for which Any() is true.
+ template <class Func>
+ void Foreach(const Func& func) const {
+ bits_[0].Foreach([&func](size_t mod) { func(mod); });
+ bits_[1].Foreach([&func](size_t mod) { func(64 + mod); });
+ }

- for (size_t i = 0; i < num_packets; ++i) {
- const Packet p = packets[i];
- // Entering a zone
- if (p.BiasedOffset() != Packet::kOffsetBias) {
- HWY_DASSERT(depth_ < kMaxDepth);
- nodes_[depth_].packet = p;
- nodes_[depth_].child_total = 0;
- ++depth_;
- continue;
- }
+ size_t Count() const { return bits_[0].Count() + bits_[1].Count(); }

- HWY_DASSERT(depth_ != 0);
- const Node& node = nodes_[depth_ - 1];
- // Masking correctly handles unsigned wraparound.
- const uint64_t duration =
- (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
- const uint64_t self_duration = ClampedSubtract(
- duration, self_overhead_ + child_overhead_ + node.child_total);
+ private:
+ static_assert(kMaxZones == 128, "Update ZoneSet");
+ BitSet64 bits_[2];
+ };

- UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
- --depth_;
+ // Modified from `ZoneSet`.
+ class ThreadSet {
+ public:
+ // No harm if `i` is already set.
+ void Set(size_t i) {
+ HWY_DASSERT(i < kMaxThreads);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
+ bits_[idx].Set(mod);
+ }

- // Deduct this nested node's time from its parent's self_duration.
- if (depth_ != 0) {
- nodes_[depth_ - 1].child_total += duration + child_overhead_;
- }
+ size_t Count() const {
+ size_t total = 0;
+ for (const BitSet64& bits : bits_) {
+ total += bits.Count();
  }
+ return total;
+ }
+
+ private:
+ BitSet64 bits_[DivCeil(kMaxThreads, size_t{64})];
+ };
+
+ // Durations are per-CPU, but end to end performance is defined by wall time.
+ // Assuming fork-join parallelism, zones are entered by multiple threads
+ // concurrently, which means the total number of unique threads is also the
+ // degree of concurrency, so we can estimate wall time as CPU time divided by
+ // the number of unique threads seen, tracked via `ThreadSet`.
+ //
+ // We also want to support varying thread counts per call site, because the same
+ // function/zone may be called from multiple pools. `EndRootRun` calls
+ // `CountThreadsAndReset` after each top-level `ThreadPool::Run`, which
+ // generates one data point summarized via descriptive statistics. Here we
+ // implement a simpler version of `hwy::Stats` because we do not require
+ // geomean/variance/kurtosis/skewness. Because concurrency is a small integer,
+ // we can simply compute sums rather than online moments. There is also only one
+ // instance across all threads, hence we do not require `Assimilate`.
+ class ConcurrencyStats {
+ public:
+ ConcurrencyStats() { Reset(); }

- const uint64_t t1 = hn::timer::Stop();
- analyze_elapsed_ += t1 - t0;
+ void Notify(const size_t x) {
+ sum_ += x;
+ ++n_;
+ min_ = HWY_MIN(min_, x);
+ max_ = HWY_MAX(max_, x);
  }

- // Incorporates results from another thread. Call after all threads have
- // exited any zones.
- void Assimilate(const Results& other) {
- namespace hn = HWY_NAMESPACE;
- const uint64_t t0 = hn::timer::Start();
- HWY_DASSERT(depth_ == 0);
- HWY_DASSERT(other.depth_ == 0);
+ size_t Count() const { return n_; }
+ size_t Min() const { return min_; }
+ size_t Max() const { return max_; }
+ double Mean() const {
+ return static_cast<double>(sum_) / static_cast<double>(n_);
+ }
+
+ void Reset() {
+ sum_ = 0;
+ n_ = 0;
+ min_ = hwy::HighestValue<size_t>();
+ max_ = hwy::LowestValue<size_t>();
+ }
+
+ private:
+ uint64_t sum_;
+ size_t n_;
+ size_t min_;
+ size_t max_;
+ };
+ static_assert(sizeof(ConcurrencyStats) == (8 + 3 * sizeof(size_t)), "");
+
+ // Holds the final results across all threads, including `ConcurrencyStats`.
+ // There is only one instance because this is updated by the main thread.
+ class Results {
+ public:
+ void Assimilate(const size_t thread, const size_t zone_idx,
+ Accumulator& other) {
+ HWY_DASSERT(thread < kMaxThreads);
+ HWY_DASSERT(zone_idx < kMaxZones);
+ HWY_DASSERT(other.zone.ZoneIdx() == zone_idx);
+
+ visited_zones_.Set(zone_idx);
+ totals_[zone_idx].Assimilate(other);
+ threads_[zone_idx].Set(thread);
+ }

- for (size_t i = 0; i < other.num_zones_; ++i) {
- const Accumulator& zone = other.zones_[i];
- UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
+ // Moves the total number of threads seen during the preceding root-level
+ // `ThreadPool::Run` into one data point for `ConcurrencyStats`.
+ void CountThreadsAndReset(const size_t zone_idx) {
+ HWY_DASSERT(zone_idx < kMaxZones);
+ const size_t num_threads = threads_[zone_idx].Count();
+ // Although threads_[zone_idx] at one point was non-empty, it is reset
+ // below, and so can be empty on the second call to this via `PrintResults`,
+ // after one from `EndRootRun`. Do not add a data point if empty.
+ if (num_threads != 0) {
+ concurrency_[zone_idx].Notify(num_threads);
  }
- const uint64_t t1 = hn::timer::Stop();
- analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+ threads_[zone_idx] = ThreadSet();
  }

- // Single-threaded.
- void Print() {
- namespace hn = HWY_NAMESPACE;
- const uint64_t t0 = hn::timer::Start();
- MergeDuplicates();
+ void CountThreadsAndReset() {
+ visited_zones_.Foreach(
+ [&](size_t zone_idx) { CountThreadsAndReset(zone_idx); });
+ }

- // Sort by decreasing total (self) cost.
- // VQSort(&zones_[0].u128, num_zones_, SortDescending());
- std::sort(zones_, zones_ + num_zones_,
- [](const Accumulator& r1, const Accumulator& r2) {
- return r1.Duration() > r2.Duration();
- });
+ void AddAnalysisTime(uint64_t t0) { analyze_elapsed_ += timer::Stop() - t0; }

+ void Print(const Names& names) {
+ const uint64_t t0 = timer::Start();
  const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();

- const char* string_origin = StringOrigin();
- for (size_t i = 0; i < num_zones_; ++i) {
- const Accumulator& r = zones_[i];
- const uint64_t num_calls = r.NumCalls();
- printf("%-40s: %10zu x %15zu = %9.6f\n", string_origin + r.BiasedOffset(),
- num_calls, r.Duration() / num_calls,
- static_cast<double>(r.Duration()) * inv_freq);
+ // Sort by decreasing total (self) cost. `totals_` are sparse, so sort an
+ // index vector instead.
+ std::vector<uint32_t> indices;
+ indices.reserve(visited_zones_.Count());
+ visited_zones_.Foreach([&](size_t zone_idx) {
+ indices.push_back(static_cast<uint32_t>(zone_idx));
+ // In case the zone exited after `EndRootRun` and was not yet added.
+ CountThreadsAndReset(zone_idx);
+ });
+ std::sort(indices.begin(), indices.end(), [&](uint32_t a, uint32_t b) {
+ return totals_[a].duration > totals_[b].duration;
+ });
+
+ for (uint32_t zone_idx : indices) {
+ Accumulator& total = totals_[zone_idx]; // cleared after printing
+ HWY_ASSERT(total.zone.ZoneIdx() == zone_idx);
+ HWY_ASSERT(total.num_calls != 0); // else visited_zones_ is wrong
+
+ ConcurrencyStats& concurrency = concurrency_[zone_idx];
+ const double duration = static_cast<double>(total.duration);
+ const double per_call =
+ static_cast<double>(total.duration) / total.num_calls;
+ // See comment on `ConcurrencyStats`.
+ const double avg_concurrency = concurrency.Mean();
+ // Avoid division by zero.
+ const double concurrency_divisor = HWY_MAX(1.0, avg_concurrency);
+ printf("%s%-40s: %10.0f x %15.0f / %5.1f (%5zu %3zu-%3zu) = %9.6f\n",
+ total.zone.IsInclusive() ? "(I)" : " ", names.Get(total.zone),
+ static_cast<double>(total.num_calls), per_call, avg_concurrency,
+ concurrency.Count(), concurrency.Min(), concurrency.Max(),
+ duration * inv_freq / concurrency_divisor);
+
+ total = Accumulator();
+ concurrency.Reset();
+ // `threads_` was already reset by `CountThreadsAndReset`.
  }
+ visited_zones_ = ZoneSet();

- const uint64_t t1 = hn::timer::Stop();
- analyze_elapsed_ += t1 - t0;
+ AddAnalysisTime(t0);
  printf("Total analysis [s]: %f\n",
  static_cast<double>(analyze_elapsed_) * inv_freq);
+ analyze_elapsed_ = 0;
  }

  private:
- // Updates an existing Accumulator (uniquely identified by biased_offset) or
- // adds one if this is the first time this thread analyzed that zone.
- // Uses a self-organizing list data structure, which avoids dynamic memory
- // allocations and is far faster than unordered_map. Loads, updates and
- // stores the entire Accumulator with vector instructions.
- void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
- const uint64_t duration) {
- HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
-
- // Special case for first zone: (maybe) update, without swapping.
- if (zones_[0].BiasedOffset() == biased_offset) {
- zones_[0].Add(num_calls, duration);
- HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
- return;
- }
-
- // Look for a zone with the same offset.
- for (size_t i = 1; i < num_zones_; ++i) {
- if (zones_[i].BiasedOffset() == biased_offset) {
- zones_[i].Add(num_calls, duration);
- HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
- // Swap with predecessor (more conservative than move to front,
- // but at least as successful).
- const Accumulator prev = zones_[i - 1];
- zones_[i - 1] = zones_[i];
- zones_[i] = prev;
- return;
- }
- }
+ uint64_t analyze_elapsed_ = 0;
+ // Indicates which of the array entries are in use.
+ ZoneSet visited_zones_;
+ Accumulator totals_[kMaxZones];
+ ThreadSet threads_[kMaxZones];
+ ConcurrencyStats concurrency_[kMaxZones];
+ };

- // Not found; create a new Accumulator.
- HWY_DASSERT(num_zones_ < kMaxZones);
- Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
- zone->Set(biased_offset, num_calls, duration);
- HWY_DASSERT(zone->BiasedOffset() == biased_offset);
- ++num_zones_;
- }
-
- // Each instantiation of a function template seems to get its own copy of
- // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
- // acceptable because we only expect a few dozen zones.
- void MergeDuplicates() {
- const char* string_origin = StringOrigin();
- for (size_t i = 0; i < num_zones_; ++i) {
- const size_t biased_offset = zones_[i].BiasedOffset();
- const char* name = string_origin + biased_offset;
- // Separate num_calls from biased_offset so we can add them together.
- uint64_t num_calls = zones_[i].NumCalls();
-
- // Add any subsequent duplicates to num_calls and total_duration.
- for (size_t j = i + 1; j < num_zones_;) {
- if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
- num_calls += zones_[j].NumCalls();
- zones_[i].Add(0, zones_[j].Duration());
- // Fill hole with last item.
- zones_[j] = zones_[--num_zones_];
- } else { // Name differed, try next Accumulator.
- ++j;
- }
- }
+ // Delay after capturing timestamps before/after the actual zone runs. Even
+ // with frequency throttling disabled, this has a multimodal distribution,
+ // including 32, 34, 48, 52, 59, 62.
+ struct Overheads {
+ uint32_t self = 0;
+ uint32_t child = 0;
+ };
+ static_assert(sizeof(Overheads) == 8, "Wrong Overheads size");

- HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));
+ class Accumulators {
+ // We generally want to group threads together because they are often
+ // accessed together during a zone, but also want to avoid threads sharing a
+ // cache line. Hence interleave 8 zones per thread.
+ static constexpr size_t kPerLine = HWY_ALIGNMENT / sizeof(Accumulator);

- // Re-pack regardless of whether any duplicates were found.
- zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
- }
+ public:
+ Accumulator& Get(const size_t thread, const size_t zone_idx) {
+ HWY_DASSERT(thread < kMaxThreads);
+ HWY_DASSERT(zone_idx < kMaxZones);
+ const size_t line = zone_idx / kPerLine;
+ const size_t offset = zone_idx % kPerLine;
+ return zones_[(line * kMaxThreads + thread) * kPerLine + offset];
  }

- uint64_t analyze_elapsed_ = 0;
- uint64_t self_overhead_ = 0;
- uint64_t child_overhead_ = 0;
-
- size_t depth_ = 0; // Number of active zones.
- size_t num_zones_ = 0; // Number of retired zones.
-
- alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth]; // Stack
- alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones]; // Self-organizing list
+ private:
+ Accumulator zones_[kMaxZones * kMaxThreads];
  };

- // Per-thread packet storage, dynamically allocated.
- class ThreadSpecific {
- static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);
-
+ // Reacts to zone enter/exit events. Builds a stack of active zones and
+ // accumulates self/child duration for each.
+ class PerThread {
  public:
- // "name" is used to sanity-check offsets fit in kOffsetBits.
- explicit ThreadSpecific(const char* name)
- : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
- packets_(AllocateAligned<Packet>(max_packets_)),
- num_packets_(0),
- string_origin_(StringOrigin()) {
- // Even in optimized builds, verify that this zone's name offset fits
- // within the allotted space. If not, UpdateOrAdd is likely to overrun
- // zones_[]. Checking here on the cold path (only reached once per thread)
- // is cheap, but it only covers one zone.
- const size_t biased_offset = name - string_origin_;
- HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
+ template <typename T>
+ static T ClampedSubtract(const T minuend, const T subtrahend) {
+ static_assert(IsUnsigned<T>(), "");
+ const T difference = minuend - subtrahend;
+ // Clang output for this is verified to CMOV rather than branch.
+ const T no_underflow = (subtrahend > minuend) ? T{0} : ~T{0};
+ return difference & no_underflow;
  }

- // Depends on Zone => defined below.
- void ComputeOverhead();
+ void SetOverheads(const Overheads& overheads) { overheads_ = overheads; }

- void WriteEntry(const char* name, const uint64_t timestamp) {
- const size_t biased_offset = name - string_origin_;
- Write(Packet::Make(biased_offset, timestamp));
+ // Entering a zone: push onto stack.
+ void Enter(const uint64_t t_enter) {
+ const size_t depth = depth_;
+ HWY_DASSERT(depth < kMaxDepth);
+ t_enter_[depth] = t_enter;
+ child_total_[1 + depth] = 0;
+ depth_ = 1 + depth;
+ HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { any_ = 1; }
  }

- void WriteExit(const uint64_t timestamp) {
- const size_t biased_offset = Packet::kOffsetBias;
- Write(Packet::Make(biased_offset, timestamp));
+ // Exiting the most recently entered zone (top of stack).
+ void Exit(const uint64_t t_exit, const size_t thread, const ZoneHandle zone,
+ Accumulators& accumulators) {
+ HWY_DASSERT(depth_ > 0);
+ const size_t depth = depth_ - 1;
+ const size_t zone_idx = zone.ZoneIdx();
+ const uint64_t duration = t_exit - t_enter_[depth];
+ // Clang output for this is verified not to branch. This is 0 if inclusive,
+ // otherwise the child total.
+ const uint64_t child_total =
+ child_total_[1 + depth] & zone.ChildTotalMask();
+
+ const uint64_t self_duration = ClampedSubtract(
+ duration, overheads_.self + overheads_.child + child_total);
+ accumulators.Get(thread, zone_idx).Add(zone, self_duration);
+ // For faster Assimilate() - not all zones are encountered.
+ visited_zones_.Set(zone_idx);
+
+ // Adding this nested time to the parent's `child_total` will
+ // cause it to be later subtracted from the parent's `self_duration`.
+ child_total_[1 + depth - 1] += duration + overheads_.child;
+
+ depth_ = depth;
  }

- void AnalyzeRemainingPackets() {
- // Ensures prior weakly-ordered streaming stores are globally visible.
- FlushStream();
+ bool HadAnyZones() const { return HWY_IS_DEBUG_BUILD ? (any_ != 0) : false; }

- // Storage full => empty it.
- if (num_packets_ + buffer_size_ > max_packets_) {
- results_.AnalyzePackets(packets_.get(), num_packets_);
- num_packets_ = 0;
- }
- CopyBytes(buffer_, packets_.get() + num_packets_,
- buffer_size_ * sizeof(Packet));
- num_packets_ += buffer_size_;
+ // Returns the duration of one enter/exit pair and resets all state. Called
+ // via `DetectSelfOverhead`.
+ uint64_t GetFirstDurationAndReset(size_t thread, Accumulators& accumulators) {
+ HWY_DASSERT(depth_ == 0);

- results_.AnalyzePackets(packets_.get(), num_packets_);
- num_packets_ = 0;
+ HWY_DASSERT(visited_zones_.Count() == 1);
+ const size_t zone_idx = visited_zones_.First();
+ HWY_DASSERT(zone_idx <= 3);
+ HWY_DASSERT(visited_zones_.Get(zone_idx));
+ visited_zones_.Clear(zone_idx);
+
+ Accumulator& zone = accumulators.Get(thread, zone_idx);
+ const uint64_t duration = zone.duration;
+ zone = Accumulator();
+ return duration;
  }

- Results& GetResults() { return results_; }
+ // Adds all data to `results` and resets it here. Called from the main thread.
+ void MoveTo(const size_t thread, Accumulators& accumulators,
+ Results& results) {
+ const uint64_t t0 = timer::Start();
+
+ visited_zones_.Foreach([&](size_t zone_idx) {
+ results.Assimilate(thread, zone_idx, accumulators.Get(thread, zone_idx));
+ });
+ // OK to reset even if we have active zones, because we set `visited_zones_`
+ // when exiting the zone.
+ visited_zones_ = ZoneSet();
+
+ results.AddAnalysisTime(t0);
+ }

  private:
- // Write packet to buffer/storage, emptying them as needed.
- void Write(const Packet packet) {
- // Buffer full => copy to storage.
- if (buffer_size_ == kBufferCapacity) {
- // Storage full => empty it.
- if (num_packets_ + kBufferCapacity > max_packets_) {
- results_.AnalyzePackets(packets_.get(), num_packets_);
- num_packets_ = 0;
- }
- // This buffering halves observer overhead and decreases the overall
- // runtime by about 3%. Casting is safe because the first member is u64.
- StreamCacheLine(
- reinterpret_cast<const uint64_t*>(buffer_),
- reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
- num_packets_ += kBufferCapacity;
- buffer_size_ = 0;
- }
- buffer_[buffer_size_] = packet;
- ++buffer_size_;
- }
-
- // Write-combining buffer to avoid cache pollution. Must be the first
- // non-static member to ensure cache-line alignment.
- Packet buffer_[kBufferCapacity];
- size_t buffer_size_ = 0;
-
- const size_t max_packets_;
- // Contiguous storage for zone enter/exit packets.
- AlignedFreeUniquePtr<Packet[]> packets_;
- size_t num_packets_;
- // Cached here because we already read this cache line on zone entry/exit.
- const char* HWY_RESTRICT string_origin_;
- Results results_;
+ // 40 bytes:
+ ZoneSet visited_zones_; // Which `zones_` have been active on this thread.
+ uint64_t depth_ = 0; // Current nesting level for active zones.
+ uint64_t any_ = 0;
+ Overheads overheads_;
+
+ uint64_t t_enter_[kMaxDepth];
+ // Used to deduct child duration from parent's self time (unless inclusive).
+ // Shifting by one avoids bounds-checks for depth_ = 0 (root zone).
+ uint64_t child_total_[1 + kMaxDepth] = {0};
  };

- class ThreadList {
+ // Enables shift rather than multiplication.
+ static_assert(sizeof(PerThread) == 256, "Wrong size");
+
+ } // namespace profiler
+
+ class Profiler {
  public:
- // Called from any thread.
- ThreadSpecific* Add(const char* name) {
- const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
- HWY_DASSERT(index < kMaxThreads);
+ static HWY_DLLEXPORT Profiler& Get();
+
+ // Assigns the next counter value to the `thread_local` that `Thread` reads.
+ // Must be called exactly once on each thread before any `PROFILER_ZONE`
+ // (without a thread argument) are re-entered by multiple threads.
+ // `Profiler()` takes care of calling this for the main thread. It is fine not
+ // to call it for other threads as long as they only use `PROFILER_ZONE2` or
+ // `PROFILER_ZONE3`, which take a thread argument and do not call `Thread`.
+ static void InitThread() { s_thread = s_num_threads.fetch_add(1); }
+
+ // Used by `PROFILER_ZONE/PROFILER_FUNC` to read the `thread` argument from
+ // thread_local storage. It is faster to instead pass the ThreadPool `thread`
+ // argument to `PROFILER_ZONE2/PROFILER_ZONE3`. Note that the main thread
+ // calls `InitThread` first, hence its `Thread` returns zero, which matches
+ // the main-first worker numbering used by `ThreadPool`.
+ static size_t Thread() { return s_thread; }
+
+ // Speeds up `UpdateResults` by providing an upper bound on the number of
+ // threads tighter than `profiler::kMaxThreads`. It is not required to be
+ // tight, and threads less than this can still be unused.
+ void SetMaxThreads(size_t max_threads) {
+ HWY_ASSERT(max_threads <= profiler::kMaxThreads);
+ max_threads_ = max_threads;
+ }
+
+ const char* Name(profiler::ZoneHandle zone) const { return names_.Get(zone); }
+
+ // Copies `name` into the string table and returns its unique `zone`. Uses
+ // linear search, which is fine because this is called during static init.
+ // Called via static initializer and the result is passed to the `Zone` ctor.
+ profiler::ZoneHandle AddZone(const char* name,
+ ProfilerFlags flags = ProfilerFlags::kDefault) {
+ return names_.AddZone(name, flags);
+ }
+
+ // For reporting average concurrency. Called by `ThreadPool::Run` on the main
+ // thread, returns true if this is the first call since the last `EndRootRun`.
+ //
+ // We want to report the concurrency of each separate 'invocation' of a zone.
+ // A unique per-call identifier (could be approximated with the line number
+ // and return address) is not sufficient because the caller may in turn be
+ // called from differing parallel sections. A per-`ThreadPool::Run` counter
+ // also under-reports concurrency because each pool in nested parallelism
+ // (over packages and CCXes) would be considered separate invocations.
+ //
+ // The alternative of detecting overlapping zones via timestamps is not 100%
+ // reliable because timers may not be synchronized across sockets or perhaps
+ // even cores. "Invariant" x86 TSCs are indeed synchronized across cores, but
+ // not across sockets unless the RESET# signal reaches each at the same time.
+ // Linux seems to make an effort to correct this, and Arm's "generic timer"
+ // broadcasts to "all cores", but there is no universal guarantee.
+ //
+ // Under the assumption that all concurrency is via our `ThreadPool`, we can
+ // record all `thread` for each outermost (root) `ThreadPool::Run`. This
+ // collapses all nested pools into one 'invocation'. We then compute per-zone
+ // concurrency as the number of unique `thread` seen per invocation.
+ bool IsRootRun() {
+ // We are not the root if a Run was already active.
+ return !run_active_.test_and_set(std::memory_order_acquire);
+ }

- ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
- threads_[index].store(ts, std::memory_order_release);
- return ts;
+ // Must be called if `IsRootRun` returned true. Resets the state so that the
+ // next call to `IsRootRun` will again return true. Called from main thread.
+ // Note that some zones may still be active. Their concurrency will be updated
+ // when `PrintResults` is called.
+ void EndRootRun() {
+ UpdateResults();
+ results_.CountThreadsAndReset();
+
+ run_active_.clear(std::memory_order_release);
  }

- // Single-threaded.
+ // Prints results. Call from main thread after all threads have exited all
+ // zones. Resets all state, can be called again after more zones.
  void PrintResults() {
- const auto acq = std::memory_order_acquire;
- const size_t num_threads = num_threads_.load(acq);
+ UpdateResults();
+ // `CountThreadsAndReset` is fused into `Print`, so do not call it here.

- ThreadSpecific* main = threads_[0].load(acq);
- main->AnalyzeRemainingPackets();
+ results_.Print(names_);
+ }

- for (size_t i = 1; i < num_threads; ++i) {
- ThreadSpecific* ts = threads_[i].load(acq);
- ts->AnalyzeRemainingPackets();
- main->GetResults().Assimilate(ts->GetResults());
+ // Only for use by Zone; called from any thread.
+ profiler::PerThread& GetThread(size_t thread) {
+ HWY_DASSERT(thread < profiler::kMaxThreads);
+ return threads_[thread];
+ }
+ profiler::Accumulators& Accumulators() { return accumulators_; }
+
+ private:
+ // Sets main thread index, computes self-overhead, and checks timer support.
+ Profiler();
+
+ // Called from the main thread.
+ void UpdateResults() {
+ for (size_t thread = 0; thread < max_threads_; ++thread) {
+ threads_[thread].MoveTo(thread, accumulators_, results_);
  }

- if (num_threads != 0) {
- main->GetResults().Print();
+ // Check that all other threads did not have any zones.
+ HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) {
+ for (size_t thread = max_threads_; thread < profiler::kMaxThreads;
+ ++thread) {
+ HWY_ASSERT(!threads_[thread].HadAnyZones());
+ }
  }
  }

- private:
- // Owning pointers.
- alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
- std::atomic<size_t> num_threads_{0};
+ static thread_local size_t s_thread;
+ static std::atomic<size_t> s_num_threads;
+ size_t max_threads_ = profiler::kMaxThreads;
+
+ std::atomic_flag run_active_ = ATOMIC_FLAG_INIT;
+
+ // To avoid locking, each thread has its own working set. We could access this
+ // through `thread_local` pointers, but that is slow to read on x86. Because
+ // our `ThreadPool` anyway passes a `thread` argument, we can instead pass
+ // that through the `PROFILER_ZONE2/PROFILER_ZONE3` macros.
+ profiler::PerThread threads_[profiler::kMaxThreads];
+
+ profiler::Accumulators accumulators_;
+
+ // Updated by the main thread after the root `ThreadPool::Run` and during
+ // `PrintResults`.
+ profiler::ConcurrencyStats concurrency_[profiler::kMaxZones];
+
+ profiler::Names names_;
+
+ profiler::Results results_;
  };

- // RAII zone enter/exit recorder constructed by the ZONE macro; also
- // responsible for initializing ThreadSpecific.
+ namespace profiler {
+
+ // RAII for zone entry/exit.
  class Zone {
  public:
- // "name" must be a string literal (see StringOrigin).
- HWY_NOINLINE explicit Zone(const char* name) {
+ // Thread-compatible; must not be called concurrently with the same `thread`.
+ // `thread` must be < `HWY_MIN(kMaxThreads, max_threads_)`, and is typically:
+ // - passed from `ThreadPool` via `PROFILER_ZONE2/PROFILER_ZONE3`. NOTE:
+ // this value must be unique across all pools, which requires an offset to
+ // a nested pool's `thread` argument.
+ // - obtained from `Profiler::Thread()`, or
+ // - 0 if only a single thread is active.
+ Zone(Profiler& profiler, size_t thread, ZoneHandle zone)
+ : profiler_(profiler) {
  HWY_FENCE;
- ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
- if (HWY_UNLIKELY(thread_specific == nullptr)) {
- // Ensure the CPU supports our timer.
- char cpu[100];
- if (!platform::HaveTimerStop(cpu)) {
- HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
- }
-
- thread_specific = StaticThreadSpecific() = Threads().Add(name);
- // Must happen after setting StaticThreadSpecific, because ComputeOverhead
- // also calls Zone().
- thread_specific->ComputeOverhead();
- }
-
- // (Capture timestamp ASAP, not inside WriteEntry.)
+ const uint64_t t_enter = timer::Start();
+ HWY_FENCE;
+ thread_ = static_cast<uint32_t>(thread);
+ zone_ = zone;
+ profiler.GetThread(thread).Enter(t_enter);
  HWY_FENCE;
- const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
- thread_specific->WriteEntry(name, timestamp);
  }

- HWY_NOINLINE ~Zone() {
+ ~Zone() {
  HWY_FENCE;
- const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
- StaticThreadSpecific()->WriteExit(timestamp);
+ const uint64_t t_exit = timer::Stop();
+ profiler_.GetThread(thread_).Exit(t_exit, thread_, zone_,
+ profiler_.Accumulators());
  HWY_FENCE;
  }

- // Call exactly once after all threads have exited all zones.
- static void PrintResults() { Threads().PrintResults(); }
-
  private:
- // Returns reference to the thread's ThreadSpecific pointer (initially null).
- // Function-local static avoids needing a separate definition.
- static ThreadSpecific*& StaticThreadSpecific() {
- static thread_local ThreadSpecific* thread_specific;
- return thread_specific;
- }
-
- // Returns the singleton ThreadList. Non time-critical.
- static ThreadList& Threads() {
- static ThreadList threads_;
- return threads_;
- }
+ Profiler& profiler_;
+ uint32_t thread_;
+ ZoneHandle zone_;
  };

- // Creates a zone starting from here until the end of the current scope.
- // Timestamps will be recorded when entering and exiting the zone.
- // "name" must be a string literal, which is ensured by merging with "".
- #define PROFILER_ZONE(name) \
- HWY_FENCE; \
- const hwy::Zone zone("" name); \
- HWY_FENCE
+ } // namespace profiler
+ #else // profiler disabled: stub implementation

- // Creates a zone for an entire function (when placed at its beginning).
- // Shorter/more convenient than ZONE.
- #define PROFILER_FUNC \
- HWY_FENCE; \
- const hwy::Zone zone(__func__); \
- HWY_FENCE
+ namespace profiler {
+ struct ZoneHandle {};
+ } // namespace profiler

- #define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults
-
- inline void ThreadSpecific::ComputeOverhead() {
- namespace hn = HWY_NAMESPACE;
- // Delay after capturing timestamps before/after the actual zone runs. Even
- // with frequency throttling disabled, this has a multimodal distribution,
- // including 32, 34, 48, 52, 59, 62.
- uint64_t self_overhead;
- {
- const size_t kNumSamples = 32;
- uint32_t samples[kNumSamples];
- for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
- const size_t kNumDurations = 1024;
- uint32_t durations[kNumDurations];
-
- for (size_t idx_duration = 0; idx_duration < kNumDurations;
- ++idx_duration) {
- {
- PROFILER_ZONE("Dummy Zone (never shown)");
- }
- const uint64_t duration = results_.ZoneDuration(buffer_);
- buffer_size_ = 0;
- durations[idx_duration] = static_cast<uint32_t>(duration);
- HWY_DASSERT(num_packets_ == 0);
- }
- robust_statistics::CountingSort(durations, kNumDurations);
- samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
- }
- // Median.
- robust_statistics::CountingSort(samples, kNumSamples);
- self_overhead = samples[kNumSamples / 2];
- if (PROFILER_PRINT_OVERHEAD) {
- printf("Overhead: %zu\n", self_overhead);
- }
- results_.SetSelfOverhead(self_overhead);
- }
-
- // Delay before capturing start timestamp / after end timestamp.
- const size_t kNumSamples = 32;
- uint32_t samples[kNumSamples];
- for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
- const size_t kNumDurations = 16;
- uint32_t durations[kNumDurations];
- for (size_t idx_duration = 0; idx_duration < kNumDurations;
- ++idx_duration) {
- const size_t kReps = 10000;
- // Analysis time should not be included => must fit within buffer.
- HWY_DASSERT(kReps * 2 < max_packets_);
- std::atomic_thread_fence(std::memory_order_seq_cst);
- const uint64_t t0 = hn::timer::Start();
- for (size_t i = 0; i < kReps; ++i) {
- PROFILER_ZONE("Dummy");
- }
- FlushStream();
- const uint64_t t1 = hn::timer::Stop();
- HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
- buffer_size_ = 0;
- num_packets_ = 0;
- const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
- durations[idx_duration] =
- static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
- }
- robust_statistics::CountingSort(durations, kNumDurations);
- samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
- }
- robust_statistics::CountingSort(samples, kNumSamples);
- const uint64_t child_overhead = samples[9 * kNumSamples / 10];
- if (PROFILER_PRINT_OVERHEAD) {
- printf("Child overhead: %zu\n", child_overhead);
+ struct Profiler {
+ static HWY_DLLEXPORT Profiler& Get();
+
+ static void InitThread() {}
+ static size_t Thread() { return 0; }
+ void SetMaxThreads(size_t) {}
+
+ const char* Name(profiler::ZoneHandle) const { return nullptr; }
+ profiler::ZoneHandle AddZone(const char*,
+ ProfilerFlags = ProfilerFlags::kDefault) {
+ return profiler::ZoneHandle();
  }
- results_.SetChildOverhead(child_overhead);
- }

- #pragma pack(pop)
+ bool IsRootRun() { return false; }
+ void EndRootRun() {}

- } // namespace hwy
+ void PrintResults() {}
+ };
+
+ namespace profiler {
+ struct Zone {
+ Zone(Profiler&, size_t, ZoneHandle) {}
+ };

+ } // namespace profiler
  #endif // PROFILER_ENABLED || HWY_IDE

- #if !PROFILER_ENABLED && !HWY_IDE
- #define PROFILER_ZONE(name)
- #define PROFILER_FUNC
- #define PROFILER_PRINT_RESULTS()
- #endif
+ } // namespace hwy
+
+ // Creates a `Zone` lvalue with a line-dependent name, which records the elapsed
+ // time from here until the end of the current scope. `p` is from
+ // `Profiler::Get()` or a cached reference. `thread` is < `kMaxThreads`. `zone`
+ // is the return value of `AddZone`. Separating its static init from the `Zone`
+ // may be more efficient than `PROFILER_ZONE2`.
+ #define PROFILER_ZONE3(p, thread, zone) \
+ HWY_FENCE; \
+ const hwy::profiler::Zone HWY_CONCAT(Z, __LINE__)(p, thread, zone); \
+ HWY_FENCE
+
+ // For compatibility with old callers that do not pass `p` nor `flags`.
+ // Also calls AddZone. Usage: `PROFILER_ZONE2(thread, "MyZone");`
+ #define PROFILER_ZONE2(thread, name) \
+ static const hwy::profiler::ZoneHandle HWY_CONCAT(zone, __LINE__) = \
+ hwy::Profiler::Get().AddZone(name); \
+ PROFILER_ZONE3(hwy::Profiler::Get(), thread, HWY_CONCAT(zone, __LINE__))
+ #define PROFILER_FUNC2(thread) PROFILER_ZONE2(thread, __func__)
+
+ // OBSOLETE: it is more efficient to pass `thread` from `ThreadPool` to
+ // `PROFILER_ZONE2/PROFILER_ZONE3`. Here we get it from thread_local storage.
+ #define PROFILER_ZONE(name) PROFILER_ZONE2(hwy::Profiler::Thread(), name)
+ #define PROFILER_FUNC PROFILER_FUNC2(hwy::Profiler::Thread())
+
+ // DEPRECATED: Use `hwy::Profiler::Get()` directly instead.
+ #define PROFILER_ADD_ZONE(name) hwy::Profiler::Get().AddZone(name)
+ #define PROFILER_IS_ROOT_RUN() hwy::Profiler::Get().IsRootRun()
+ #define PROFILER_END_ROOT_RUN() hwy::Profiler::Get().EndRootRun()
+ #define PROFILER_PRINT_RESULTS() hwy::Profiler::Get().PrintResults()

  #endif // HIGHWAY_HWY_PROFILER_H_
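For orientation, below is a minimal usage sketch of the zone API added by this version, based only on the macros and comments in the new hwy/profiler.h shown above. It is not part of the package; it assumes a build with PROFILER_ENABLED set to 1, and FuncToMeasure plus the zone names are hypothetical.

// Sketch only: assumes PROFILER_ENABLED=1 (normally set via build flags).
#include <cstddef>
#include "hwy/profiler.h"

// Hypothetical workload; `thread` is the ThreadPool worker index.
void FuncToMeasure(size_t thread) {
  // Registers the zone once (static) and records self-time until scope exit.
  PROFILER_ZONE2(thread, "FuncToMeasure");
  // ... code to measure ...
}

int main() {
  hwy::Profiler& p = hwy::Profiler::Get();
  // Preferred API: separate static registration, optionally requesting
  // inclusive (cumulative) time via ProfilerFlags.
  static const auto zone = p.AddZone("main", hwy::ProfilerFlags::kInclusive);
  {
    PROFILER_ZONE3(p, /*thread=*/0, zone);
    FuncToMeasure(/*thread=*/0);
  }
  // After all threads have exited all zones, print counts and durations.
  p.PrintResults();
  return 0;
}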