@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +8 -8
package/include/hwy/profiler.h
CHANGED
@@ -15,6 +15,11 @@
 #ifndef HIGHWAY_HWY_PROFILER_H_
 #define HIGHWAY_HWY_PROFILER_H_

+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/highway_export.h"
+
 // High precision, low overhead time measurements. Returns exact call counts and
 // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
 //
@@ -22,627 +27,758 @@
 // { PROFILER_ZONE("name"); /*code*/ } or
 // the name of the current function:
 // void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+// You can reduce the overhead by passing a thread ID:
+// `PROFILER_ZONE2(thread, name)`. The new and preferred API also allows
+// passing flags, such as requesting inclusive time:
+// `static const auto zone = profiler.AddZone("name", flags);` and then
+// `PROFILER_ZONE3(profiler, thread, zone)`.
 //
-// After all threads
+// After all threads exit all zones, call `Profiler::Get().PrintResults()` to
 // print call counts and average durations [CPU cycles] to stdout, sorted in
 // descending order of total duration.
-//
-// The binary MUST be built with --dynamic_mode=off because we rely on the data
-// segments being nearby; if not, an assertion will likely fail.

-
-
-// Configuration settings:
-
-// If zero, this file has no effect and no measurements will be recorded.
+// If zero, mock `Profiler` and `profiler::Zone` will be defined.
 #ifndef PROFILER_ENABLED
 #define PROFILER_ENABLED 0
 #endif

-
-// enters at least one zone. Once this buffer is full, the thread will analyze
-// and discard packets, thus temporarily adding some observer overhead.
-// Each zone occupies 16 bytes.
-#ifndef PROFILER_THREAD_STORAGE
-#define PROFILER_THREAD_STORAGE 200ULL
-#endif
-
-#if PROFILER_ENABLED || HWY_IDE
-
-#include <stddef.h>
-#include <stdint.h>
+#if PROFILER_ENABLED
 #include <stdio.h>
-#include <string.h>  // strcmp
+#include <string.h>  // strcmp, strlen

 #include <algorithm>  // std::sort
 #include <atomic>
+#include <vector>

 #include "hwy/aligned_allocator.h"
-#include "hwy/
-
-#include "hwy/highway.h"  // Stream
-#include "hwy/robust_statistics.h"
-#include "hwy/timer-inl.h"
+#include "hwy/base.h"
+#include "hwy/bit_set.h"
 #include "hwy/timer.h"
-
-#define PROFILER_PRINT_OVERHEAD 0
+#endif  // PROFILER_ENABLED

 namespace hwy {

-//
+// Flags: we want type-safety (enum class) to catch mistakes such as confusing
+// zone with flags. Base type (`uint32_t`) ensures it is safe to cast. Defined
+// outside the `#if` because callers pass them to `PROFILER_ZONE3`. Keep in
+// sync with `kNumFlags` below.
+enum class ProfilerFlags : uint32_t {
+  kDefault = 0,
+  // The zone should report cumulative time, including all child zones. If not
+  // specified, zones report self-time, excluding child zones.
+  kInclusive = 1
+};

-
-// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
-// WARNING: a fiber library can spawn hundreds of threads.
-static constexpr size_t kMaxThreads = 256;
+#if PROFILER_ENABLED

-
+// Implementation details.
+namespace profiler {

-
+HWY_INLINE_VAR constexpr size_t kNumFlags = 1;

-//
-// Both pointers must be aligned.
-HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
-                                     uint64_t* HWY_RESTRICT to) {
-  namespace hn = HWY_NAMESPACE;
-  const hn::ScalableTag<uint64_t> d;
-  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += Lanes(d)) {
-    hn::Stream(hn::Load(d, from + i), d, to + i);
-  }
-}
+// Upper bounds for fixed-size data structures, guarded via HWY_DASSERT:

-
+// Maximum nesting of zones, chosen such that PerThread is 256 bytes.
+HWY_INLINE_VAR constexpr size_t kMaxDepth = 13;
+// Reports with more than ~50 are anyway difficult to read.
+HWY_INLINE_VAR constexpr size_t kMaxZones = 128;
+// Upper bound on threads that call `InitThread`, and `thread` arguments. Note
+// that fiber libraries can spawn hundreds of threads. Enough for Turin cores.
+HWY_INLINE_VAR constexpr size_t kMaxThreads = 256;

-//
-
-class Packet {
+// Type-safe wrapper for zone index plus flags, returned by `AddZone`.
+class ZoneHandle {
  public:
-
-
-
-
+  ZoneHandle() : bits_(0) {}  // for Accumulator member initialization
+
+  ZoneHandle(size_t zone_idx, ProfilerFlags flags) {
+    HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones);
+    const uint32_t flags_u = static_cast<uint32_t>(flags);
+    HWY_DASSERT(flags_u < (1u << kNumFlags));
+    bits_ = (static_cast<uint32_t>(zone_idx) << kNumFlags) | flags_u;
+    HWY_DASSERT(ZoneIdx() == zone_idx);
+  }

-
-
-  // multiple zones). Wraparound is handled by masking.
-  static constexpr size_t kTimestampBits = 64 - kOffsetBits;
-  static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
+  ZoneHandle(const ZoneHandle& other) = default;
+  ZoneHandle& operator=(const ZoneHandle& other) = default;

-
-
+  bool operator==(const ZoneHandle other) const { return bits_ == other.bits_; }
+  bool operator!=(const ZoneHandle other) const { return bits_ != other.bits_; }

-
-
-
-
+  size_t ZoneIdx() const {
+    HWY_DASSERT(bits_ != 0);
+    const size_t zone_idx = bits_ >> kNumFlags;
+    HWY_DASSERT(0 != zone_idx && zone_idx < kMaxZones);
+    return zone_idx;
   }

-
+  bool IsInclusive() const {
+    HWY_DASSERT(bits_ != 0);
+    return (bits_ & static_cast<uint32_t>(ProfilerFlags::kInclusive)) != 0;
+  }

-
+  // Returns a mask to zero/ignore child totals for inclusive zones.
+  uint64_t ChildTotalMask() const {
+    // Without this function, clang tends to generate a branch.
+    return IsInclusive() ? 0 : ~uint64_t{0};
+  }

  private:
-
+  uint32_t bits_;
 };
-static_assert(sizeof(Packet) == 8, "Wrong Packet size");

-//
-
-
-
-
-//
-
-
-
-
-
-
-
-
-
-
-
-
+// Storage for zone names.
+class Names {
+  static constexpr std::memory_order kRel = std::memory_order_relaxed;
+
+ public:
+  // Returns a copy of the `name` passed to `AddZone` that returned the
+  // given `zone`.
+  const char* Get(ZoneHandle zone) const { return ptrs_[zone.ZoneIdx()]; }
+
+  ZoneHandle AddZone(const char* name, ProfilerFlags flags) {
+    // Linear search whether it already exists.
+    const size_t num_zones = next_ptr_.load(kRel);
+    HWY_ASSERT(num_zones < kMaxZones);
+    for (size_t zone_idx = 1; zone_idx < num_zones; ++zone_idx) {
+      if (!strcmp(ptrs_[zone_idx], name)) {
+        return ZoneHandle(zone_idx, flags);
+      }
+    }
+
+    // Reserve the next `zone_idx` (index in `ptrs_`).
+    const size_t zone_idx = next_ptr_.fetch_add(1, kRel);
+
+    // Copy into `name` into `chars_`.
+    const size_t len = strlen(name) + 1;
+    const size_t pos = next_char_.fetch_add(len, kRel);
+    HWY_ASSERT(pos + len <= sizeof(chars_));
+    strcpy(chars_ + pos, name);  // NOLINT
+
+    ptrs_[zone_idx] = chars_ + pos;
+    const ZoneHandle zone(zone_idx, flags);
+    HWY_DASSERT(!strcmp(Get(zone), name));
+    return zone;
+  }
+
+ private:
+  const char* ptrs_[kMaxZones];
+  std::atomic<size_t> next_ptr_{1};  // next zone_idx
+  char chars_[kMaxZones * 70];
+  std::atomic<size_t> next_char_{0};
 };
-static_assert(sizeof(Node) == 16, "Wrong Node size");

-// Holds
+// Holds total duration and number of calls. "Which thread entered it" is
+// unnecessary because these are per-thread.
 struct Accumulator {
-
+  void Add(ZoneHandle new_zone, uint64_t self_duration) {
+    duration += self_duration;

-
-
-
+    // Only called for valid zones.
+    HWY_DASSERT(new_zone != ZoneHandle());
+    // Our zone might not have been set yet.
+    HWY_DASSERT(zone == ZoneHandle() || zone == new_zone);
+    zone = new_zone;

-
-    u128.hi = duration;
-    u128.lo = (biased_offset << kNumCallBits) + num_calls;
+    num_calls += 1;
   }

-  void
-
-
+  void Assimilate(Accumulator& other) {
+    duration += other.duration;
+    other.duration = 0;
+
+    // `ZoneSet` ensures we only call this for non-empty `other`.
+    HWY_DASSERT(other.zone != ZoneHandle());
+    // Our zone might not have been set yet.
+    HWY_DASSERT(zone == ZoneHandle() || zone == other.zone);
+    zone = other.zone;
+
+    num_calls += other.num_calls;
+    other.num_calls = 0;
   }

-
-  //
-
+  uint64_t duration = 0;
+  ZoneHandle zone;  // flags are used by `Results::Print`
+  uint32_t num_calls = 0;
 };
 static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");

-
-
-
-    return 0;
-  }
-  return minuend - subtrahend;
-}
-
-// Per-thread call graph (stack) and Accumulator for each zone.
-class Results {
+// Modified from `hwy::BitSet4096`. Avoids the second-level `BitSet64`, because
+// we only need `kMaxZones` = 128.
+class ZoneSet {
  public:
-
+  // No harm if `i` is already set.
+  void Set(size_t i) {
+    HWY_DASSERT(i < kMaxZones);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
+    bits_[idx].Set(mod);
+    HWY_DASSERT(Get(i));
+  }

-
-
-
-
-
-
-    const uint64_t duration = zones_[0].Duration();
-    zones_[0].Set(0, 0, 0);
-    HWY_DASSERT(depth_ == 0);
-    num_zones_ = 0;
-    return duration;
+  void Clear(size_t i) {
+    HWY_DASSERT(i < kMaxZones);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
+    bits_[idx].Clear(mod);
+    HWY_DASSERT(!Get(i));
   }

-
-
+  bool Get(size_t i) const {
+    HWY_DASSERT(i < kMaxZones);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
+    return bits_[idx].Get(mod);
   }

-
-
+  // Returns lowest i such that Get(i). Caller must ensure Any() beforehand!
+  size_t First() const {
+    HWY_DASSERT(bits_[0].Any() || bits_[1].Any());
+    const size_t idx = bits_[0].Any() ? 0 : 1;
+    return idx * 64 + bits_[idx].First();
   }

-  //
-  //
-
-
-
+  // Calls `func(i)` for each `i` in the set. It is safe for `func` to modify
+  // the set, but the current Foreach call is only affected if changing one of
+  // the not yet visited BitSet64 for which Any() is true.
+  template <class Func>
+  void Foreach(const Func& func) const {
+    bits_[0].Foreach([&func](size_t mod) { func(mod); });
+    bits_[1].Foreach([&func](size_t mod) { func(64 + mod); });
+  }

-
-    const Packet p = packets[i];
-    // Entering a zone
-    if (p.BiasedOffset() != Packet::kOffsetBias) {
-      HWY_DASSERT(depth_ < kMaxDepth);
-      nodes_[depth_].packet = p;
-      nodes_[depth_].child_total = 0;
-      ++depth_;
-      continue;
-    }
+  size_t Count() const { return bits_[0].Count() + bits_[1].Count(); }

-
-
-
-
-        (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
-    const uint64_t self_duration = ClampedSubtract(
-        duration, self_overhead_ + child_overhead_ + node.child_total);
+ private:
+  static_assert(kMaxZones == 128, "Update ZoneSet");
+  BitSet64 bits_[2];
+};

-
-
+// Modified from `ZoneSet`.
+class ThreadSet {
+ public:
+  // No harm if `i` is already set.
+  void Set(size_t i) {
+    HWY_DASSERT(i < kMaxThreads);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
+    bits_[idx].Set(mod);
+  }

-
-
-
-
+  size_t Count() const {
+    size_t total = 0;
+    for (const BitSet64& bits : bits_) {
+      total += bits.Count();
     }
+    return total;
+  }
+
+ private:
+  BitSet64 bits_[DivCeil(kMaxThreads, size_t{64})];
+};
+
+// Durations are per-CPU, but end to end performance is defined by wall time.
+// Assuming fork-join parallelism, zones are entered by multiple threads
+// concurrently, which means the total number of unique threads is also the
+// degree of concurrency, so we can estimate wall time as CPU time divided by
+// the number of unique threads seen, tracked via `ThreadSet`.
+//
+// We also want to support varying thread counts per call site, because the same
+// function/zone may be called from multiple pools. `EndRootRun` calls
+// `CountThreadsAndReset` after each top-level `ThreadPool::Run`, which
+// generates one data point summarized via descriptive statistics. Here we
+// implement a simpler version of `hwy::Stats` because we do not require
+// geomean/variance/kurtosis/skewness. Because concurrency is a small integer,
+// we can simply compute sums rather than online moments. There is also only one
+// instance across all threads, hence we do not require `Assimilate`.
+class ConcurrencyStats {
+ public:
+  ConcurrencyStats() { Reset(); }

-
-
+  void Notify(const size_t x) {
+    sum_ += x;
+    ++n_;
+    min_ = HWY_MIN(min_, x);
+    max_ = HWY_MAX(max_, x);
   }

-
-
-
-
-
-
-
+  size_t Count() const { return n_; }
+  size_t Min() const { return min_; }
+  size_t Max() const { return max_; }
+  double Mean() const {
+    return static_cast<double>(sum_) / static_cast<double>(n_);
+  }
+
+  void Reset() {
+    sum_ = 0;
+    n_ = 0;
+    min_ = hwy::HighestValue<size_t>();
+    max_ = hwy::LowestValue<size_t>();
+  }
+
+ private:
+  uint64_t sum_;
+  size_t n_;
+  size_t min_;
+  size_t max_;
+};
+static_assert(sizeof(ConcurrencyStats) == (8 + 3 * sizeof(size_t)), "");
+
+// Holds the final results across all threads, including `ConcurrencyStats`.
+// There is only one instance because this is updated by the main thread.
+class Results {
+ public:
+  void Assimilate(const size_t thread, const size_t zone_idx,
+                  Accumulator& other) {
+    HWY_DASSERT(thread < kMaxThreads);
+    HWY_DASSERT(zone_idx < kMaxZones);
+    HWY_DASSERT(other.zone.ZoneIdx() == zone_idx);
+
+    visited_zones_.Set(zone_idx);
+    totals_[zone_idx].Assimilate(other);
+    threads_[zone_idx].Set(thread);
+  }

-
-
-
+  // Moves the total number of threads seen during the preceding root-level
+  // `ThreadPool::Run` into one data point for `ConcurrencyStats`.
+  void CountThreadsAndReset(const size_t zone_idx) {
+    HWY_DASSERT(zone_idx < kMaxZones);
+    const size_t num_threads = threads_[zone_idx].Count();
+    // Although threads_[zone_idx] at one point was non-empty, it is reset
+    // below, and so can be empty on the second call to this via `PrintResults`,
+    // after one from `EndRootRun`. Do not add a data point if empty.
+    if (num_threads != 0) {
+      concurrency_[zone_idx].Notify(num_threads);
     }
-
-    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+    threads_[zone_idx] = ThreadSet();
   }

-
-
-
-
-    MergeDuplicates();
+  void CountThreadsAndReset() {
+    visited_zones_.Foreach(
+        [&](size_t zone_idx) { CountThreadsAndReset(zone_idx); });
+  }

-
-    // VQSort(&zones_[0].u128, num_zones_, SortDescending());
-    std::sort(zones_, zones_ + num_zones_,
-              [](const Accumulator& r1, const Accumulator& r2) {
-                return r1.Duration() > r2.Duration();
-              });
+  void AddAnalysisTime(uint64_t t0) { analyze_elapsed_ += timer::Stop() - t0; }

+  void Print(const Names& names) {
+    const uint64_t t0 = timer::Start();
     const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();

-
-
-
-
-
-
-
+    // Sort by decreasing total (self) cost. `totals_` are sparse, so sort an
+    // index vector instead.
+    std::vector<uint32_t> indices;
+    indices.reserve(visited_zones_.Count());
+    visited_zones_.Foreach([&](size_t zone_idx) {
+      indices.push_back(static_cast<uint32_t>(zone_idx));
+      // In case the zone exited after `EndRootRun` and was not yet added.
+      CountThreadsAndReset(zone_idx);
+    });
+    std::sort(indices.begin(), indices.end(), [&](uint32_t a, uint32_t b) {
+      return totals_[a].duration > totals_[b].duration;
+    });
+
+    for (uint32_t zone_idx : indices) {
+      Accumulator& total = totals_[zone_idx];  // cleared after printing
+      HWY_ASSERT(total.zone.ZoneIdx() == zone_idx);
+      HWY_ASSERT(total.num_calls != 0);  // else visited_zones_ is wrong
+
+      ConcurrencyStats& concurrency = concurrency_[zone_idx];
+      const double duration = static_cast<double>(total.duration);
+      const double per_call =
+          static_cast<double>(total.duration) / total.num_calls;
+      // See comment on `ConcurrencyStats`.
+      const double avg_concurrency = concurrency.Mean();
+      // Avoid division by zero.
+      const double concurrency_divisor = HWY_MAX(1.0, avg_concurrency);
+      printf("%s%-40s: %10.0f x %15.0f / %5.1f (%5zu %3zu-%3zu) = %9.6f\n",
+             total.zone.IsInclusive() ? "(I)" : " ", names.Get(total.zone),
+             static_cast<double>(total.num_calls), per_call, avg_concurrency,
+             concurrency.Count(), concurrency.Min(), concurrency.Max(),
+             duration * inv_freq / concurrency_divisor);
+
+      total = Accumulator();
+      concurrency.Reset();
+      // `threads_` was already reset by `CountThreadsAndReset`.
     }
+    visited_zones_ = ZoneSet();

-
-    analyze_elapsed_ += t1 - t0;
+    AddAnalysisTime(t0);
     printf("Total analysis [s]: %f\n",
            static_cast<double>(analyze_elapsed_) * inv_freq);
+    analyze_elapsed_ = 0;
   }

  private:
-
-  //
-
-
-
-
-
-    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
-
-    // Special case for first zone: (maybe) update, without swapping.
-    if (zones_[0].BiasedOffset() == biased_offset) {
-      zones_[0].Add(num_calls, duration);
-      HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
-      return;
-    }
-
-    // Look for a zone with the same offset.
-    for (size_t i = 1; i < num_zones_; ++i) {
-      if (zones_[i].BiasedOffset() == biased_offset) {
-        zones_[i].Add(num_calls, duration);
-        HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
-        // Swap with predecessor (more conservative than move to front,
-        // but at least as successful).
-        const Accumulator prev = zones_[i - 1];
-        zones_[i - 1] = zones_[i];
-        zones_[i] = prev;
-        return;
-      }
-    }
+  uint64_t analyze_elapsed_ = 0;
+  // Indicates which of the array entries are in use.
+  ZoneSet visited_zones_;
+  Accumulator totals_[kMaxZones];
+  ThreadSet threads_[kMaxZones];
+  ConcurrencyStats concurrency_[kMaxZones];
+};

-
-
-
-
-
-
-
-
-  // Each instantiation of a function template seems to get its own copy of
-  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
-  // acceptable because we only expect a few dozen zones.
-  void MergeDuplicates() {
-    const char* string_origin = StringOrigin();
-    for (size_t i = 0; i < num_zones_; ++i) {
-      const size_t biased_offset = zones_[i].BiasedOffset();
-      const char* name = string_origin + biased_offset;
-      // Separate num_calls from biased_offset so we can add them together.
-      uint64_t num_calls = zones_[i].NumCalls();
-
-      // Add any subsequent duplicates to num_calls and total_duration.
-      for (size_t j = i + 1; j < num_zones_;) {
-        if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
-          num_calls += zones_[j].NumCalls();
-          zones_[i].Add(0, zones_[j].Duration());
-          // Fill hole with last item.
-          zones_[j] = zones_[--num_zones_];
-        } else {  // Name differed, try next Accumulator.
-          ++j;
-        }
-      }
+// Delay after capturing timestamps before/after the actual zone runs. Even
+// with frequency throttling disabled, this has a multimodal distribution,
+// including 32, 34, 48, 52, 59, 62.
+struct Overheads {
+  uint32_t self = 0;
+  uint32_t child = 0;
+};
+static_assert(sizeof(Overheads) == 8, "Wrong Overheads size");

-
+class Accumulators {
+  // We generally want to group threads together because they are often
+  // accessed together during a zone, but also want to avoid threads sharing a
+  // cache line. Hence interleave 8 zones per thread.
+  static constexpr size_t kPerLine = HWY_ALIGNMENT / sizeof(Accumulator);

-
-
-
+ public:
+  Accumulator& Get(const size_t thread, const size_t zone_idx) {
+    HWY_DASSERT(thread < kMaxThreads);
+    HWY_DASSERT(zone_idx < kMaxZones);
+    const size_t line = zone_idx / kPerLine;
+    const size_t offset = zone_idx % kPerLine;
+    return zones_[(line * kMaxThreads + thread) * kPerLine + offset];
   }

-
-
-  uint64_t child_overhead_ = 0;
-
-  size_t depth_ = 0;      // Number of active zones.
-  size_t num_zones_ = 0;  // Number of retired zones.
-
-  alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth];         // Stack
-  alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones];  // Self-organizing list
+ private:
+  Accumulator zones_[kMaxZones * kMaxThreads];
 };

-//
-
-
-
+// Reacts to zone enter/exit events. Builds a stack of active zones and
+// accumulates self/child duration for each.
+class PerThread {
  public:
-
-
-
-
-
-
-
-  // within the allotted space. If not, UpdateOrAdd is likely to overrun
-  // zones_[]. Checking here on the cold path (only reached once per thread)
-  // is cheap, but it only covers one zone.
-  const size_t biased_offset = name - string_origin_;
-  HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
+  template <typename T>
+  static T ClampedSubtract(const T minuend, const T subtrahend) {
+    static_assert(IsUnsigned<T>(), "");
+    const T difference = minuend - subtrahend;
+    // Clang output for this is verified to CMOV rather than branch.
+    const T no_underflow = (subtrahend > minuend) ? T{0} : ~T{0};
+    return difference & no_underflow;
   }

-
-  void ComputeOverhead();
+  void SetOverheads(const Overheads& overheads) { overheads_ = overheads; }

-
-
-
+  // Entering a zone: push onto stack.
+  void Enter(const uint64_t t_enter) {
+    const size_t depth = depth_;
+    HWY_DASSERT(depth < kMaxDepth);
+    t_enter_[depth] = t_enter;
+    child_total_[1 + depth] = 0;
+    depth_ = 1 + depth;
+    HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { any_ = 1; }
   }

-
-
-
+  // Exiting the most recently entered zone (top of stack).
+  void Exit(const uint64_t t_exit, const size_t thread, const ZoneHandle zone,
+            Accumulators& accumulators) {
+    HWY_DASSERT(depth_ > 0);
+    const size_t depth = depth_ - 1;
+    const size_t zone_idx = zone.ZoneIdx();
+    const uint64_t duration = t_exit - t_enter_[depth];
+    // Clang output for this is verified not to branch. This is 0 if inclusive,
+    // otherwise the child total.
+    const uint64_t child_total =
+        child_total_[1 + depth] & zone.ChildTotalMask();
+
+    const uint64_t self_duration = ClampedSubtract(
+        duration, overheads_.self + overheads_.child + child_total);
+    accumulators.Get(thread, zone_idx).Add(zone, self_duration);
+    // For faster Assimilate() - not all zones are encountered.
+    visited_zones_.Set(zone_idx);
+
+    // Adding this nested time to the parent's `child_total` will
+    // cause it to be later subtracted from the parent's `self_duration`.
+    child_total_[1 + depth - 1] += duration + overheads_.child;
+
+    depth_ = depth;
   }

-
-  // Ensures prior weakly-ordered streaming stores are globally visible.
-  FlushStream();
+  bool HadAnyZones() const { return HWY_IS_DEBUG_BUILD ? (any_ != 0) : false; }

-
-
-
-
-    }
-    CopyBytes(buffer_, packets_.get() + num_packets_,
-              buffer_size_ * sizeof(Packet));
-    num_packets_ += buffer_size_;
+  // Returns the duration of one enter/exit pair and resets all state. Called
+  // via `DetectSelfOverhead`.
+  uint64_t GetFirstDurationAndReset(size_t thread, Accumulators& accumulators) {
+    HWY_DASSERT(depth_ == 0);

-
-
+    HWY_DASSERT(visited_zones_.Count() == 1);
+    const size_t zone_idx = visited_zones_.First();
+    HWY_DASSERT(zone_idx <= 3);
+    HWY_DASSERT(visited_zones_.Get(zone_idx));
+    visited_zones_.Clear(zone_idx);
+
+    Accumulator& zone = accumulators.Get(thread, zone_idx);
+    const uint64_t duration = zone.duration;
+    zone = Accumulator();
+    return duration;
   }

-
+  // Adds all data to `results` and resets it here. Called from the main thread.
+  void MoveTo(const size_t thread, Accumulators& accumulators,
+              Results& results) {
+    const uint64_t t0 = timer::Start();
+
+    visited_zones_.Foreach([&](size_t zone_idx) {
+      results.Assimilate(thread, zone_idx, accumulators.Get(thread, zone_idx));
+    });
+    // OK to reset even if we have active zones, because we set `visited_zones_`
+    // when exiting the zone.
+    visited_zones_ = ZoneSet();
+
+    results.AddAnalysisTime(t0);
+  }

  private:
-  //
-
-
-
-
-
-
-
-
-
-  // runtime by about 3%. Casting is safe because the first member is u64.
-  StreamCacheLine(
-      reinterpret_cast<const uint64_t*>(buffer_),
-      reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
-  num_packets_ += kBufferCapacity;
-  buffer_size_ = 0;
-  }
-  buffer_[buffer_size_] = packet;
-  ++buffer_size_;
-  }
-
-  // Write-combining buffer to avoid cache pollution. Must be the first
-  // non-static member to ensure cache-line alignment.
-  Packet buffer_[kBufferCapacity];
-  size_t buffer_size_ = 0;
-
-  const size_t max_packets_;
-  // Contiguous storage for zone enter/exit packets.
-  AlignedFreeUniquePtr<Packet[]> packets_;
-  size_t num_packets_;
-  // Cached here because we already read this cache line on zone entry/exit.
-  const char* HWY_RESTRICT string_origin_;
-  Results results_;
+  // 40 bytes:
+  ZoneSet visited_zones_;  // Which `zones_` have been active on this thread.
+  uint64_t depth_ = 0;  // Current nesting level for active zones.
+  uint64_t any_ = 0;
+  Overheads overheads_;
+
+  uint64_t t_enter_[kMaxDepth];
+  // Used to deduct child duration from parent's self time (unless inclusive).
+  // Shifting by one avoids bounds-checks for depth_ = 0 (root zone).
+  uint64_t child_total_[1 + kMaxDepth] = {0};
 };

-
+// Enables shift rather than multiplication.
+static_assert(sizeof(PerThread) == 256, "Wrong size");
+
+}  // namespace profiler
+
+class Profiler {
  public:
-
-
-
-
+  static HWY_DLLEXPORT Profiler& Get();
+
+  // Assigns the next counter value to the `thread_local` that `Thread` reads.
+  // Must be called exactly once on each thread before any `PROFILER_ZONE`
+  // (without a thread argument) are re-entered by multiple threads.
+  // `Profiler()` takes care of calling this for the main thread. It is fine not
+  // to call it for other threads as long as they only use `PROFILER_ZONE2` or
+  // `PROFILER_ZONE3`, which take a thread argument and do not call `Thread`.
+  static void InitThread() { s_thread = s_num_threads.fetch_add(1); }
+
+  // Used by `PROFILER_ZONE/PROFILER_FUNC` to read the `thread` argument from
+  // thread_local storage. It is faster to instead pass the ThreadPool `thread`
+  // argument to `PROFILER_ZONE2/PROFILER_ZONE3`. Note that the main thread
+  // calls `InitThread` first, hence its `Thread` returns zero, which matches
+  // the main-first worker numbering used by `ThreadPool`.
+  static size_t Thread() { return s_thread; }
+
+  // Speeds up `UpdateResults` by providing an upper bound on the number of
+  // threads tighter than `profiler::kMaxThreads`. It is not required to be
+  // tight, and threads less than this can still be unused.
+  void SetMaxThreads(size_t max_threads) {
+    HWY_ASSERT(max_threads <= profiler::kMaxThreads);
+    max_threads_ = max_threads;
+  }
+
+  const char* Name(profiler::ZoneHandle zone) const { return names_.Get(zone); }
+
+  // Copies `name` into the string table and returns its unique `zone`. Uses
+  // linear search, which is fine because this is called during static init.
+  // Called via static initializer and the result is passed to the `Zone` ctor.
+  profiler::ZoneHandle AddZone(const char* name,
+                               ProfilerFlags flags = ProfilerFlags::kDefault) {
+    return names_.AddZone(name, flags);
+  }
+
+  // For reporting average concurrency. Called by `ThreadPool::Run` on the main
+  // thread, returns true if this is the first call since the last `EndRootRun`.
+  //
+  // We want to report the concurrency of each separate 'invocation' of a zone.
+  // A unique per-call identifier (could be approximated with the line number
+  // and return address) is not sufficient because the caller may in turn be
+  // called from differing parallel sections. A per-`ThreadPool::Run` counter
+  // also under-reports concurrency because each pool in nested parallelism
+  // (over packages and CCXes) would be considered separate invocations.
+  //
+  // The alternative of detecting overlapping zones via timestamps is not 100%
+  // reliable because timers may not be synchronized across sockets or perhaps
+  // even cores. "Invariant" x86 TSCs are indeed synchronized across cores, but
+  // not across sockets unless the RESET# signal reaches each at the same time.
+  // Linux seems to make an effort to correct this, and Arm's "generic timer"
+  // broadcasts to "all cores", but there is no universal guarantee.
+  //
+  // Under the assumption that all concurrency is via our `ThreadPool`, we can
+  // record all `thread` for each outermost (root) `ThreadPool::Run`. This
+  // collapses all nested pools into one 'invocation'. We then compute per-zone
+  // concurrency as the number of unique `thread` seen per invocation.
+  bool IsRootRun() {
+    // We are not the root if a Run was already active.
+    return !run_active_.test_and_set(std::memory_order_acquire);
+  }

-
-
-
+  // Must be called if `IsRootRun` returned true. Resets the state so that the
+  // next call to `IsRootRun` will again return true. Called from main thread.
+  // Note that some zones may still be active. Their concurrency will be updated
+  // when `PrintResults` is called.
+  void EndRootRun() {
+    UpdateResults();
+    results_.CountThreadsAndReset();
+
+    run_active_.clear(std::memory_order_release);
   }

-  //
+  // Prints results. Call from main thread after all threads have exited all
+  // zones. Resets all state, can be called again after more zones.
   void PrintResults() {
-
-
+    UpdateResults();
+    // `CountThreadsAndReset` is fused into `Print`, so do not call it here.

-
-
+    results_.Print(names_);
+  }

-
-
-
-
+  // Only for use by Zone; called from any thread.
+  profiler::PerThread& GetThread(size_t thread) {
+    HWY_DASSERT(thread < profiler::kMaxThreads);
+    return threads_[thread];
+  }
+  profiler::Accumulators& Accumulators() { return accumulators_; }
+
+ private:
+  // Sets main thread index, computes self-overhead, and checks timer support.
+  Profiler();
+
+  // Called from the main thread.
+  void UpdateResults() {
+    for (size_t thread = 0; thread < max_threads_; ++thread) {
+      threads_[thread].MoveTo(thread, accumulators_, results_);
     }

-
-
+    // Check that all other threads did not have any zones.
+    HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) {
+      for (size_t thread = max_threads_; thread < profiler::kMaxThreads;
+           ++thread) {
+        HWY_ASSERT(!threads_[thread].HadAnyZones());
+      }
     }
   }

-
-
-
-
+  static thread_local size_t s_thread;
+  static std::atomic<size_t> s_num_threads;
+  size_t max_threads_ = profiler::kMaxThreads;
+
+  std::atomic_flag run_active_ = ATOMIC_FLAG_INIT;
+
+  // To avoid locking, each thread has its own working set. We could access this
+  // through `thread_local` pointers, but that is slow to read on x86. Because
+  // our `ThreadPool` anyway passes a `thread` argument, we can instead pass
+  // that through the `PROFILER_ZONE2/PROFILER_ZONE3` macros.
+  profiler::PerThread threads_[profiler::kMaxThreads];
+
+  profiler::Accumulators accumulators_;
+
+  // Updated by the main thread after the root `ThreadPool::Run` and during
+  // `PrintResults`.
+  profiler::ConcurrencyStats concurrency_[profiler::kMaxZones];
+
+  profiler::Names names_;
+
+  profiler::Results results_;
 };

-
-
+namespace profiler {
+
+// RAII for zone entry/exit.
 class Zone {
  public:
-  //
-
+  // Thread-compatible; must not be called concurrently with the same `thread`.
+  // `thread` must be < `HWY_MIN(kMaxThreads, max_threads_)`, and is typically:
+  // - passed from `ThreadPool` via `PROFILER_ZONE2/PROFILER_ZONE3`. NOTE:
+  //   this value must be unique across all pools, which requires an offset to
+  //   a nested pool's `thread` argument.
+  // - obtained from `Profiler::Thread()`, or
+  // - 0 if only a single thread is active.
+  Zone(Profiler& profiler, size_t thread, ZoneHandle zone)
+      : profiler_(profiler) {
     HWY_FENCE;
-
-
-
-
-
-      HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
-    }
-
-    thread_specific = StaticThreadSpecific() = Threads().Add(name);
-    // Must happen after setting StaticThreadSpecific, because ComputeOverhead
-    // also calls Zone().
-    thread_specific->ComputeOverhead();
-  }
-
-  // (Capture timestamp ASAP, not inside WriteEntry.)
+    const uint64_t t_enter = timer::Start();
+    HWY_FENCE;
+    thread_ = static_cast<uint32_t>(thread);
+    zone_ = zone;
+    profiler.GetThread(thread).Enter(t_enter);
     HWY_FENCE;
-    const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
-    thread_specific->WriteEntry(name, timestamp);
   }

-
+  ~Zone() {
     HWY_FENCE;
-    const uint64_t
-
+    const uint64_t t_exit = timer::Stop();
+    profiler_.GetThread(thread_).Exit(t_exit, thread_, zone_,
+                                      profiler_.Accumulators());
     HWY_FENCE;
   }

-  // Call exactly once after all threads have exited all zones.
-  static void PrintResults() { Threads().PrintResults(); }
-
  private:
-
-
-
-    static thread_local ThreadSpecific* thread_specific;
-    return thread_specific;
-  }
-
-  // Returns the singleton ThreadList. Non time-critical.
-  static ThreadList& Threads() {
-    static ThreadList threads_;
-    return threads_;
-  }
+  Profiler& profiler_;
+  uint32_t thread_;
+  ZoneHandle zone_;
 };

-//
-//
-// "name" must be a string literal, which is ensured by merging with "".
-#define PROFILER_ZONE(name) \
-  HWY_FENCE; \
-  const hwy::Zone zone("" name); \
-  HWY_FENCE
+}  // namespace profiler
+#else  // profiler disabled: stub implementation

-
-
-
-  HWY_FENCE; \
-  const hwy::Zone zone(__func__); \
-  HWY_FENCE
+namespace profiler {
+struct ZoneHandle {};
+}  // namespace profiler

-
-
-
-
-
-
-
-
-
-
-
-    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
-      const size_t kNumDurations = 1024;
-      uint32_t durations[kNumDurations];
-
-      for (size_t idx_duration = 0; idx_duration < kNumDurations;
-           ++idx_duration) {
-        {
-          PROFILER_ZONE("Dummy Zone (never shown)");
-        }
-        const uint64_t duration = results_.ZoneDuration(buffer_);
-        buffer_size_ = 0;
-        durations[idx_duration] = static_cast<uint32_t>(duration);
-        HWY_DASSERT(num_packets_ == 0);
-      }
-      robust_statistics::CountingSort(durations, kNumDurations);
-      samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
-    }
-    // Median.
-    robust_statistics::CountingSort(samples, kNumSamples);
-    self_overhead = samples[kNumSamples / 2];
-    if (PROFILER_PRINT_OVERHEAD) {
-      printf("Overhead: %zu\n", self_overhead);
-    }
-    results_.SetSelfOverhead(self_overhead);
-  }
-
-    // Delay before capturing start timestamp / after end timestamp.
-    const size_t kNumSamples = 32;
-    uint32_t samples[kNumSamples];
-    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
-      const size_t kNumDurations = 16;
-      uint32_t durations[kNumDurations];
-      for (size_t idx_duration = 0; idx_duration < kNumDurations;
-           ++idx_duration) {
-        const size_t kReps = 10000;
-        // Analysis time should not be included => must fit within buffer.
-        HWY_DASSERT(kReps * 2 < max_packets_);
-        std::atomic_thread_fence(std::memory_order_seq_cst);
-        const uint64_t t0 = hn::timer::Start();
-        for (size_t i = 0; i < kReps; ++i) {
-          PROFILER_ZONE("Dummy");
-        }
-        FlushStream();
-        const uint64_t t1 = hn::timer::Stop();
-        HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
-        buffer_size_ = 0;
-        num_packets_ = 0;
-        const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
-        durations[idx_duration] =
-            static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
-      }
-      robust_statistics::CountingSort(durations, kNumDurations);
-      samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
-    }
-    robust_statistics::CountingSort(samples, kNumSamples);
-    const uint64_t child_overhead = samples[9 * kNumSamples / 10];
-    if (PROFILER_PRINT_OVERHEAD) {
-      printf("Child overhead: %zu\n", child_overhead);
+struct Profiler {
+  static HWY_DLLEXPORT Profiler& Get();
+
+  static void InitThread() {}
+  static size_t Thread() { return 0; }
+  void SetMaxThreads(size_t) {}
+
+  const char* Name(profiler::ZoneHandle) const { return nullptr; }
+  profiler::ZoneHandle AddZone(const char*,
+                               ProfilerFlags = ProfilerFlags::kDefault) {
+    return profiler::ZoneHandle();
   }
-    results_.SetChildOverhead(child_overhead);
-  }

-
+  bool IsRootRun() { return false; }
+  void EndRootRun() {}

-
+  void PrintResults() {}
+};
+
+namespace profiler {
+struct Zone {
+  Zone(Profiler&, size_t, ZoneHandle) {}
+};

+}  // namespace profiler
 #endif  // PROFILER_ENABLED || HWY_IDE

-
-
-
-
-
+}  // namespace hwy
+
+// Creates a `Zone` lvalue with a line-dependent name, which records the elapsed
+// time from here until the end of the current scope. `p` is from
+// `Profiler::Get()` or a cached reference. `thread` is < `kMaxThreads`. `zone`
+// is the return value of `AddZone`. Separating its static init from the `Zone`
+// may be more efficient than `PROFILER_ZONE2`.
+#define PROFILER_ZONE3(p, thread, zone) \
+  HWY_FENCE; \
+  const hwy::profiler::Zone HWY_CONCAT(Z, __LINE__)(p, thread, zone); \
+  HWY_FENCE
+
+// For compatibility with old callers that do not pass `p` nor `flags`.
+// Also calls AddZone. Usage: `PROFILER_ZONE2(thread, "MyZone");`
+#define PROFILER_ZONE2(thread, name) \
+  static const hwy::profiler::ZoneHandle HWY_CONCAT(zone, __LINE__) = \
+      hwy::Profiler::Get().AddZone(name); \
+  PROFILER_ZONE3(hwy::Profiler::Get(), thread, HWY_CONCAT(zone, __LINE__))
+#define PROFILER_FUNC2(thread) PROFILER_ZONE2(thread, __func__)
+
+// OBSOLETE: it is more efficient to pass `thread` from `ThreadPool` to
+// `PROFILER_ZONE2/PROFILER_ZONE3`. Here we get it from thread_local storage.
+#define PROFILER_ZONE(name) PROFILER_ZONE2(hwy::Profiler::Thread(), name)
+#define PROFILER_FUNC PROFILER_FUNC2(hwy::Profiler::Thread())
+
+// DEPRECATED: Use `hwy::Profiler::Get()` directly instead.
+#define PROFILER_ADD_ZONE(name) hwy::Profiler::Get().AddZone(name)
+#define PROFILER_IS_ROOT_RUN() hwy::Profiler::Get().IsRootRun()
+#define PROFILER_END_ROOT_RUN() hwy::Profiler::Get().EndRootRun()
+#define PROFILER_PRINT_RESULTS() hwy::Profiler::Get().PrintResults()

 #endif  // HIGHWAY_HWY_PROFILER_H_