@img/sharp-libvips-dev 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/aom/aom_encoder.h +3 -3
- package/include/aom/aomcx.h +17 -8
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/fontconfig/fontconfig.h +5 -3
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -7
- package/include/gio-unix-2.0/gio/gunixinputstream.h +0 -5
- package/include/gio-unix-2.0/gio/gunixoutputstream.h +0 -5
- package/include/glib-2.0/gio/gappinfo.h +0 -7
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/gapplicationcommandline.h +12 -1
- package/include/glib-2.0/gio/gasyncinitable.h +0 -7
- package/include/glib-2.0/gio/gasyncresult.h +0 -6
- package/include/glib-2.0/gio/gbufferedinputstream.h +0 -5
- package/include/glib-2.0/gio/gbufferedoutputstream.h +0 -5
- package/include/glib-2.0/gio/gbytesicon.h +0 -5
- package/include/glib-2.0/gio/gcancellable.h +0 -5
- package/include/glib-2.0/gio/gconverter.h +0 -7
- package/include/glib-2.0/gio/gconverterinputstream.h +0 -6
- package/include/glib-2.0/gio/gconverteroutputstream.h +0 -6
- package/include/glib-2.0/gio/gdatagrambased.h +0 -7
- package/include/glib-2.0/gio/gdatainputstream.h +0 -6
- package/include/glib-2.0/gio/gdataoutputstream.h +0 -6
- package/include/glib-2.0/gio/gdbusinterface.h +0 -8
- package/include/glib-2.0/gio/gdbusinterfaceskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusmessage.h +2 -1
- package/include/glib-2.0/gio/gdbusobjectmanagerclient.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectmanagerserver.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectproxy.h +0 -8
- package/include/glib-2.0/gio/gdbusobjectskeleton.h +0 -8
- package/include/glib-2.0/gio/gdbusproxy.h +0 -8
- package/include/glib-2.0/gio/gdebugcontroller.h +0 -8
- package/include/glib-2.0/gio/gdebugcontrollerdbus.h +0 -7
- package/include/glib-2.0/gio/gdtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gemblem.h +0 -5
- package/include/glib-2.0/gio/gemblemedicon.h +0 -5
- package/include/glib-2.0/gio/gfile.h +0 -10
- package/include/glib-2.0/gio/gfileenumerator.h +0 -5
- package/include/glib-2.0/gio/gfileicon.h +0 -5
- package/include/glib-2.0/gio/gfileinfo.h +0 -5
- package/include/glib-2.0/gio/gfileinputstream.h +0 -8
- package/include/glib-2.0/gio/gfileiostream.h +0 -8
- package/include/glib-2.0/gio/gfilemonitor.h +0 -5
- package/include/glib-2.0/gio/gfilenamecompleter.h +0 -5
- package/include/glib-2.0/gio/gfileoutputstream.h +0 -8
- package/include/glib-2.0/gio/gfilterinputstream.h +0 -5
- package/include/glib-2.0/gio/gfilteroutputstream.h +0 -5
- package/include/glib-2.0/gio/gicon.h +0 -5
- package/include/glib-2.0/gio/ginitable.h +0 -7
- package/include/glib-2.0/gio/ginputstream.h +0 -5
- package/include/glib-2.0/gio/gio-autocleanups.h +4 -0
- package/include/glib-2.0/gio/gio-visibility.h +34 -0
- package/include/glib-2.0/gio/gioenums.h +6 -1
- package/include/glib-2.0/gio/giomodule.h +0 -5
- package/include/glib-2.0/gio/giostream.h +0 -5
- package/include/glib-2.0/gio/giotypes.h +5 -108
- package/include/glib-2.0/gio/gloadableicon.h +0 -6
- package/include/glib-2.0/gio/gmemoryinputstream.h +0 -5
- package/include/glib-2.0/gio/gmemoryoutputstream.h +0 -5
- package/include/glib-2.0/gio/gmountoperation.h +0 -6
- package/include/glib-2.0/gio/gnetworking.h +4 -0
- package/include/glib-2.0/gio/goutputstream.h +0 -9
- package/include/glib-2.0/gio/gpollableinputstream.h +0 -7
- package/include/glib-2.0/gio/gpollableoutputstream.h +0 -7
- package/include/glib-2.0/gio/gproxy.h +0 -7
- package/include/glib-2.0/gio/gproxyaddressenumerator.h +0 -8
- package/include/glib-2.0/gio/gseekable.h +0 -5
- package/include/glib-2.0/gio/gsettingsbackend.h +0 -5
- package/include/glib-2.0/gio/gsimpleactiongroup.h +0 -7
- package/include/glib-2.0/gio/gsimpleasyncresult.h +0 -5
- package/include/glib-2.0/gio/gsimpleproxyresolver.h +0 -5
- package/include/glib-2.0/gio/gsocket.h +13 -0
- package/include/glib-2.0/gio/gsocketaddressenumerator.h +0 -6
- package/include/glib-2.0/gio/gsocketconnectable.h +0 -5
- package/include/glib-2.0/gio/gtask.h +12 -0
- package/include/glib-2.0/gio/gthemedicon.h +0 -5
- package/include/glib-2.0/gio/gtlsserverconnection.h +0 -8
- package/include/glib-2.0/gio/gunixcredentialsmessage.h +0 -8
- package/include/glib-2.0/gio/gvfs.h +0 -5
- package/include/glib-2.0/gio/gvolume.h +2 -2
- package/include/glib-2.0/gio/gvolumemonitor.h +0 -5
- package/include/glib-2.0/girepository/gi-visibility.h +986 -0
- package/include/glib-2.0/girepository/giarginfo.h +100 -0
- package/include/glib-2.0/girepository/gibaseinfo.h +129 -0
- package/include/glib-2.0/girepository/gicallableinfo.h +119 -0
- package/include/glib-2.0/girepository/gicallbackinfo.h +60 -0
- package/include/glib-2.0/girepository/giconstantinfo.h +72 -0
- package/include/glib-2.0/girepository/gienuminfo.h +82 -0
- package/include/glib-2.0/girepository/gifieldinfo.h +84 -0
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +117 -0
- package/include/glib-2.0/girepository/giinterfaceinfo.h +120 -0
- package/include/glib-2.0/girepository/giobjectinfo.h +230 -0
- package/include/glib-2.0/girepository/gipropertyinfo.h +77 -0
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +75 -0
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +247 -0
- package/include/glib-2.0/girepository/girffi.h +129 -0
- package/include/glib-2.0/girepository/gisignalinfo.h +72 -0
- package/include/glib-2.0/girepository/gistructinfo.h +102 -0
- package/include/glib-2.0/girepository/gitypeinfo.h +144 -0
- package/include/glib-2.0/girepository/gitypelib.h +61 -0
- package/include/glib-2.0/girepository/gitypes.h +421 -0
- package/include/glib-2.0/girepository/giunioninfo.h +105 -0
- package/include/glib-2.0/girepository/giunresolvedinfo.h +60 -0
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +88 -0
- package/include/glib-2.0/glib/deprecated/gcompletion.h +1 -1
- package/include/glib-2.0/glib/deprecated/grel.h +0 -23
- package/include/glib-2.0/glib/deprecated/gthread.h +10 -6
- package/include/glib-2.0/glib/gatomic.h +20 -20
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gbookmarkfile.h +39 -1
- package/include/glib-2.0/glib/gchecksum.h +0 -10
- package/include/glib-2.0/glib/gdate.h +0 -9
- package/include/glib-2.0/glib/gdatetime.h +33 -1
- package/include/glib-2.0/glib/gdir.h +5 -0
- package/include/glib-2.0/glib/ghmac.h +0 -9
- package/include/glib-2.0/glib/glib-autocleanups.h +4 -0
- package/include/glib-2.0/glib/glib-visibility.h +34 -0
- package/include/glib-2.0/glib/gmacros.h +1 -0
- package/include/glib-2.0/glib/gmessages.h +11 -0
- package/include/glib-2.0/glib/gpathbuf.h +0 -7
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstdio.h +1 -1
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gstrvbuilder.h +4 -8
- package/include/glib-2.0/glib/gtestutils.h +5 -0
- package/include/glib-2.0/glib/gthread.h +216 -3
- package/include/glib-2.0/glib/gunicode.h +12 -2
- package/include/glib-2.0/glib/gvarianttype.h +1 -10
- package/include/glib-2.0/glib/gversionmacros.h +9 -0
- package/include/glib-2.0/glib/gwin32.h +4 -4
- package/include/glib-2.0/glib-unix.h +214 -0
- package/include/glib-2.0/gmodule/gmodule-visibility.h +34 -0
- package/include/glib-2.0/gobject/gbinding.h +0 -8
- package/include/glib-2.0/gobject/gbindinggroup.h +0 -8
- package/include/glib-2.0/gobject/gclosure.h +1 -9
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +44 -0
- package/include/glib-2.0/gobject/gobject-autocleanups.h +4 -0
- package/include/glib-2.0/gobject/gobject-visibility.h +34 -0
- package/include/glib-2.0/gobject/gobject.h +1 -16
- package/include/glib-2.0/gobject/gparam.h +3 -12
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/glib-2.0/gobject/gsignalgroup.h +0 -8
- package/include/glib-2.0/gobject/gtype.h +53 -20
- package/include/glib-2.0/gobject/gtypemodule.h +0 -7
- package/include/glib-2.0/gobject/gtypeplugin.h +0 -6
- package/include/glib-2.0/gobject/gvaluearray.h +0 -7
- package/include/glib-2.0/gobject/gvaluecollector.h +1 -11
- package/include/glib-2.0/gobject/gvaluetypes.h +2 -0
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/lcms2.h +46 -7
- package/include/lcms2_plugin.h +4 -4
- package/include/libheif/heif_version.h +2 -2
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/HTMLparser.h +23 -0
- package/include/libxml2/libxml/SAX.h +0 -2
- package/include/libxml2/libxml/SAX2.h +0 -2
- package/include/libxml2/libxml/c14n.h +0 -2
- package/include/libxml2/libxml/dict.h +1 -0
- package/include/libxml2/libxml/encoding.h +16 -14
- package/include/libxml2/libxml/entities.h +4 -0
- package/include/libxml2/libxml/globals.h +15 -503
- package/include/libxml2/libxml/hash.h +57 -61
- package/include/libxml2/libxml/nanoftp.h +2 -2
- package/include/libxml2/libxml/parser.h +137 -18
- package/include/libxml2/libxml/parserInternals.h +1 -0
- package/include/libxml2/libxml/relaxng.h +2 -1
- package/include/libxml2/libxml/schemasInternals.h +1 -0
- package/include/libxml2/libxml/schematron.h +1 -0
- package/include/libxml2/libxml/threads.h +4 -11
- package/include/libxml2/libxml/tree.h +68 -20
- package/include/libxml2/libxml/uri.h +2 -1
- package/include/libxml2/libxml/valid.h +2 -0
- package/include/libxml2/libxml/xmlIO.h +65 -13
- package/include/libxml2/libxml/xmlerror.h +37 -8
- package/include/libxml2/libxml/xmlmemory.h +37 -40
- package/include/libxml2/libxml/xmlreader.h +6 -0
- package/include/libxml2/libxml/xmlregexp.h +2 -9
- package/include/libxml2/libxml/xmlsave.h +9 -0
- package/include/libxml2/libxml/xmlschemas.h +3 -0
- package/include/libxml2/libxml/xmlversion.h +28 -43
- package/include/libxml2/libxml/xpath.h +1 -1
- package/include/libxml2/libxml/xpathInternals.h +2 -1
- package/include/libxml2/libxml/xpointer.h +5 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +3 -3
- package/include/pixman-1/pixman.h +9 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/include/zconf.h +3 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +15 -15
package/include/hwy/per_target.h
CHANGED
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#define HIGHWAY_HWY_PER_TARGET_H_
|
|
18
18
|
|
|
19
19
|
#include <stddef.h>
|
|
20
|
+
#include <stdint.h>
|
|
20
21
|
|
|
21
22
|
#include "hwy/highway_export.h"
|
|
22
23
|
|
|
@@ -25,6 +26,9 @@
|
|
|
25
26
|
|
|
26
27
|
namespace hwy {
|
|
27
28
|
|
|
29
|
+
// Returns the HWY_TARGET which HWY_DYNAMIC_DISPATCH selected.
|
|
30
|
+
HWY_DLLEXPORT int64_t DispatchedTarget();
|
|
31
|
+
|
|
28
32
|
// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
|
|
29
33
|
//
|
|
30
34
|
// Do not cache the result, which may change after calling DisableTargets, or
|
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
// Copyright 2017 Google Inc. All Rights Reserved.
|
|
2
|
+
//
|
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
// you may not use this file except in compliance with the License.
|
|
5
|
+
// You may obtain a copy of the License at
|
|
6
|
+
//
|
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
//
|
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
// See the License for the specific language governing permissions and
|
|
13
|
+
// limitations under the License.
|
|
14
|
+
|
|
15
|
+
#ifndef HIGHWAY_HWY_PROFILER_H_
|
|
16
|
+
#define HIGHWAY_HWY_PROFILER_H_
|
|
17
|
+
|
|
18
|
+
// High precision, low overhead time measurements. Returns exact call counts and
|
|
19
|
+
// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
|
|
20
|
+
//
|
|
21
|
+
// Uses RAII to capture begin/end timestamps, with user-specified zone names:
|
|
22
|
+
// { PROFILER_ZONE("name"); /*code*/ } or
|
|
23
|
+
// the name of the current function:
|
|
24
|
+
// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
|
|
25
|
+
//
|
|
26
|
+
// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
|
|
27
|
+
// print call counts and average durations [CPU cycles] to stdout, sorted in
|
|
28
|
+
// descending order of total duration.
|
|
29
|
+
//
|
|
30
|
+
// The binary MUST be built with --dynamic_mode=off because we rely on the data
|
|
31
|
+
// segments being nearby; if not, an assertion will likely fail.
|
|
32
|
+
|
|
33
|
+
#include "hwy/base.h"
|
|
34
|
+
|
|
35
|
+
// Configuration settings:
|
|
36
|
+
|
|
37
|
+
// If zero, this file has no effect and no measurements will be recorded.
|
|
38
|
+
#ifndef PROFILER_ENABLED
|
|
39
|
+
#define PROFILER_ENABLED 0
|
|
40
|
+
#endif
|
|
41
|
+
|
|
42
|
+
// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
|
|
43
|
+
// enters at least one zone. Once this buffer is full, the thread will analyze
|
|
44
|
+
// and discard packets, thus temporarily adding some observer overhead.
|
|
45
|
+
// Each zone occupies 16 bytes.
|
|
46
|
+
#ifndef PROFILER_THREAD_STORAGE
|
|
47
|
+
#define PROFILER_THREAD_STORAGE 200ULL
|
|
48
|
+
#endif
|
|
49
|
+
|
|
50
|
+
#if PROFILER_ENABLED || HWY_IDE
|
|
51
|
+
|
|
52
|
+
#include <stddef.h>
|
|
53
|
+
#include <stdint.h>
|
|
54
|
+
#include <stdio.h>
|
|
55
|
+
#include <string.h> // strcmp
|
|
56
|
+
|
|
57
|
+
#include <algorithm> // std::sort
|
|
58
|
+
#include <atomic>
|
|
59
|
+
|
|
60
|
+
#include "hwy/aligned_allocator.h"
|
|
61
|
+
#include "hwy/cache_control.h" // FlushStream
|
|
62
|
+
// #include "hwy/contrib/sort/vqsort.h"
|
|
63
|
+
#include "hwy/highway.h" // Stream
|
|
64
|
+
#include "hwy/robust_statistics.h"
|
|
65
|
+
#include "hwy/timer-inl.h"
|
|
66
|
+
#include "hwy/timer.h"
|
|
67
|
+
|
|
68
|
+
#define PROFILER_PRINT_OVERHEAD 0
|
|
69
|
+
|
|
70
|
+
namespace hwy {
|
|
71
|
+
|
|
72
|
+
// Capacity limits of the fixed-size data structures below. Each limit is
// checked via HWY_DASSERT where the corresponding structure is filled.

// Maximum number of threads that may enter a zone; threads which never enter
// one are not counted. Memory use is about
// kMaxThreads * PROFILER_THREAD_STORAGE MiB.
// WARNING: a fiber library can spawn hundreds of threads.
static constexpr size_t kMaxThreads{256};

// Deepest permitted nesting of zones.
static constexpr size_t kMaxDepth{64};

// Total number of zones.
static constexpr size_t kMaxZones{256};
|
|
82
|
+
|
|
83
|
+
// Overwrites "to" without loading it into the cache (read-for-ownership).
|
|
84
|
+
// Both pointers must be aligned.
|
|
85
|
+
HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
|
|
86
|
+
uint64_t* HWY_RESTRICT to) {
|
|
87
|
+
namespace hn = HWY_NAMESPACE;
|
|
88
|
+
const hn::ScalableTag<uint64_t> d;
|
|
89
|
+
for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += Lanes(d)) {
|
|
90
|
+
hn::Stream(hn::Load(d, from + i), d, to + i);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
#pragma pack(push, 1)
|
|
95
|
+
|
|
96
|
+
// Represents zone entry/exit events. Stores a full-resolution timestamp plus
|
|
97
|
+
// an offset (representing zone name or identifying exit packets). POD.
|
|
98
|
+
class Packet {
|
|
99
|
+
public:
|
|
100
|
+
// If offsets do not fit, UpdateOrAdd will overrun our heap allocation
|
|
101
|
+
// (governed by kMaxZones). We have seen multi-megabyte offsets.
|
|
102
|
+
static constexpr size_t kOffsetBits = 25;
|
|
103
|
+
static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);
|
|
104
|
+
|
|
105
|
+
// We need full-resolution timestamps; at an effective rate of 4 GHz,
|
|
106
|
+
// this permits 1 minute zone durations (for longer durations, split into
|
|
107
|
+
// multiple zones). Wraparound is handled by masking.
|
|
108
|
+
static constexpr size_t kTimestampBits = 64 - kOffsetBits;
|
|
109
|
+
static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;
|
|
110
|
+
|
|
111
|
+
static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
|
|
112
|
+
HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));
|
|
113
|
+
|
|
114
|
+
Packet packet;
|
|
115
|
+
packet.bits_ =
|
|
116
|
+
(biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
|
|
117
|
+
return packet;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
uint64_t Timestamp() const { return bits_ & kTimestampMask; }
|
|
121
|
+
|
|
122
|
+
size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }
|
|
123
|
+
|
|
124
|
+
private:
|
|
125
|
+
uint64_t bits_;
|
|
126
|
+
};
|
|
127
|
+
static_assert(sizeof(Packet) == 8, "Wrong Packet size");
|
|
128
|
+
|
|
129
|
+
// Returns the address of a string literal. Assuming zone names are also
|
|
130
|
+
// literals and stored nearby, we can represent them as offsets, which are
|
|
131
|
+
// faster to compute than hashes or even a static index.
|
|
132
|
+
//
|
|
133
|
+
// This function must not be static - each call (even from other translation
|
|
134
|
+
// units) must return the same value.
|
|
135
|
+
inline const char* StringOrigin() {
|
|
136
|
+
// Chosen such that no zone name is a prefix nor suffix of this string
|
|
137
|
+
// to ensure they aren't merged (offset 0 identifies zone-exit packets).
|
|
138
|
+
static const char* string_origin = "__#__";
|
|
139
|
+
return string_origin - Packet::kOffsetBias;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Representation of an active zone, stored in a stack. Used to deduct
|
|
143
|
+
// child duration from the parent's self time. POD.
|
|
144
|
+
struct Node {
|
|
145
|
+
Packet packet;
|
|
146
|
+
uint64_t child_total;
|
|
147
|
+
};
|
|
148
|
+
static_assert(sizeof(Node) == 16, "Wrong Node size");
|
|
149
|
+
|
|
150
|
+
// Holds statistics for all zones with the same name. POD.
|
|
151
|
+
struct Accumulator {
|
|
152
|
+
static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;
|
|
153
|
+
|
|
154
|
+
uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
|
|
155
|
+
uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
|
|
156
|
+
uint64_t Duration() const { return u128.hi; }
|
|
157
|
+
|
|
158
|
+
void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
|
|
159
|
+
u128.hi = duration;
|
|
160
|
+
u128.lo = (biased_offset << kNumCallBits) + num_calls;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
void Add(uint64_t num_calls, uint64_t duration) {
|
|
164
|
+
u128.lo += num_calls;
|
|
165
|
+
u128.hi += duration;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// For fast sorting by duration, which must therefore be the hi element.
|
|
169
|
+
// lo holds BiasedOffset and NumCalls.
|
|
170
|
+
uint128_t u128;
|
|
171
|
+
};
|
|
172
|
+
static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");
|
|
173
|
+
|
|
174
|
+
// Returns minuend - subtrahend, or zero if the subtrahend is the larger of the
// two (i.e. the result is clamped at zero instead of wrapping or going
// negative).
template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  return (subtrahend > minuend) ? static_cast<T>(0)
                                : static_cast<T>(minuend - subtrahend);
}
|
|
181
|
+
|
|
182
|
+
// Per-thread call graph (stack) and Accumulator for each zone.
|
|
183
|
+
class Results {
|
|
184
|
+
public:
|
|
185
|
+
Results() { ZeroBytes(zones_, sizeof(zones_)); }
|
|
186
|
+
|
|
187
|
+
// Used for computing overhead when this thread encounters its first Zone.
|
|
188
|
+
// This has no observable effect apart from increasing "analyze_elapsed_".
|
|
189
|
+
uint64_t ZoneDuration(const Packet* packets) {
|
|
190
|
+
HWY_DASSERT(depth_ == 0);
|
|
191
|
+
HWY_DASSERT(num_zones_ == 0);
|
|
192
|
+
AnalyzePackets(packets, 2);
|
|
193
|
+
const uint64_t duration = zones_[0].Duration();
|
|
194
|
+
zones_[0].Set(0, 0, 0);
|
|
195
|
+
HWY_DASSERT(depth_ == 0);
|
|
196
|
+
num_zones_ = 0;
|
|
197
|
+
return duration;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
void SetSelfOverhead(const uint64_t self_overhead) {
|
|
201
|
+
self_overhead_ = self_overhead;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
void SetChildOverhead(const uint64_t child_overhead) {
|
|
205
|
+
child_overhead_ = child_overhead;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
// Draw all required information from the packets, which can be discarded
|
|
209
|
+
// afterwards. Called whenever this thread's storage is full.
|
|
210
|
+
void AnalyzePackets(const Packet* packets, const size_t num_packets) {
|
|
211
|
+
namespace hn = HWY_NAMESPACE;
|
|
212
|
+
const uint64_t t0 = hn::timer::Start();
|
|
213
|
+
|
|
214
|
+
for (size_t i = 0; i < num_packets; ++i) {
|
|
215
|
+
const Packet p = packets[i];
|
|
216
|
+
// Entering a zone
|
|
217
|
+
if (p.BiasedOffset() != Packet::kOffsetBias) {
|
|
218
|
+
HWY_DASSERT(depth_ < kMaxDepth);
|
|
219
|
+
nodes_[depth_].packet = p;
|
|
220
|
+
nodes_[depth_].child_total = 0;
|
|
221
|
+
++depth_;
|
|
222
|
+
continue;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
HWY_DASSERT(depth_ != 0);
|
|
226
|
+
const Node& node = nodes_[depth_ - 1];
|
|
227
|
+
// Masking correctly handles unsigned wraparound.
|
|
228
|
+
const uint64_t duration =
|
|
229
|
+
(p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
|
|
230
|
+
const uint64_t self_duration = ClampedSubtract(
|
|
231
|
+
duration, self_overhead_ + child_overhead_ + node.child_total);
|
|
232
|
+
|
|
233
|
+
UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
|
|
234
|
+
--depth_;
|
|
235
|
+
|
|
236
|
+
// Deduct this nested node's time from its parent's self_duration.
|
|
237
|
+
if (depth_ != 0) {
|
|
238
|
+
nodes_[depth_ - 1].child_total += duration + child_overhead_;
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
const uint64_t t1 = hn::timer::Stop();
|
|
243
|
+
analyze_elapsed_ += t1 - t0;
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
// Incorporates results from another thread. Call after all threads have
|
|
247
|
+
// exited any zones.
|
|
248
|
+
void Assimilate(const Results& other) {
|
|
249
|
+
namespace hn = HWY_NAMESPACE;
|
|
250
|
+
const uint64_t t0 = hn::timer::Start();
|
|
251
|
+
HWY_DASSERT(depth_ == 0);
|
|
252
|
+
HWY_DASSERT(other.depth_ == 0);
|
|
253
|
+
|
|
254
|
+
for (size_t i = 0; i < other.num_zones_; ++i) {
|
|
255
|
+
const Accumulator& zone = other.zones_[i];
|
|
256
|
+
UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
|
|
257
|
+
}
|
|
258
|
+
const uint64_t t1 = hn::timer::Stop();
|
|
259
|
+
analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
// Single-threaded.
|
|
263
|
+
void Print() {
|
|
264
|
+
namespace hn = HWY_NAMESPACE;
|
|
265
|
+
const uint64_t t0 = hn::timer::Start();
|
|
266
|
+
MergeDuplicates();
|
|
267
|
+
|
|
268
|
+
// Sort by decreasing total (self) cost.
|
|
269
|
+
// VQSort(&zones_[0].u128, num_zones_, SortDescending());
|
|
270
|
+
std::sort(zones_, zones_ + num_zones_,
|
|
271
|
+
[](const Accumulator& r1, const Accumulator& r2) {
|
|
272
|
+
return r1.Duration() > r2.Duration();
|
|
273
|
+
});
|
|
274
|
+
|
|
275
|
+
const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();
|
|
276
|
+
|
|
277
|
+
const char* string_origin = StringOrigin();
|
|
278
|
+
for (size_t i = 0; i < num_zones_; ++i) {
|
|
279
|
+
const Accumulator& r = zones_[i];
|
|
280
|
+
const uint64_t num_calls = r.NumCalls();
|
|
281
|
+
printf("%-40s: %10zu x %15zu = %9.6f\n", string_origin + r.BiasedOffset(),
|
|
282
|
+
num_calls, r.Duration() / num_calls,
|
|
283
|
+
static_cast<double>(r.Duration()) * inv_freq);
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
const uint64_t t1 = hn::timer::Stop();
|
|
287
|
+
analyze_elapsed_ += t1 - t0;
|
|
288
|
+
printf("Total analysis [s]: %f\n",
|
|
289
|
+
static_cast<double>(analyze_elapsed_) * inv_freq);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
private:
|
|
293
|
+
// Updates an existing Accumulator (uniquely identified by biased_offset) or
|
|
294
|
+
// adds one if this is the first time this thread analyzed that zone.
|
|
295
|
+
// Uses a self-organizing list data structure, which avoids dynamic memory
|
|
296
|
+
// allocations and is far faster than unordered_map. Loads, updates and
|
|
297
|
+
// stores the entire Accumulator with vector instructions.
|
|
298
|
+
void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
|
|
299
|
+
const uint64_t duration) {
|
|
300
|
+
HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));
|
|
301
|
+
|
|
302
|
+
// Special case for first zone: (maybe) update, without swapping.
|
|
303
|
+
if (zones_[0].BiasedOffset() == biased_offset) {
|
|
304
|
+
zones_[0].Add(num_calls, duration);
|
|
305
|
+
HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
|
|
306
|
+
return;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
// Look for a zone with the same offset.
|
|
310
|
+
for (size_t i = 1; i < num_zones_; ++i) {
|
|
311
|
+
if (zones_[i].BiasedOffset() == biased_offset) {
|
|
312
|
+
zones_[i].Add(num_calls, duration);
|
|
313
|
+
HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
|
|
314
|
+
// Swap with predecessor (more conservative than move to front,
|
|
315
|
+
// but at least as successful).
|
|
316
|
+
const Accumulator prev = zones_[i - 1];
|
|
317
|
+
zones_[i - 1] = zones_[i];
|
|
318
|
+
zones_[i] = prev;
|
|
319
|
+
return;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Not found; create a new Accumulator.
|
|
324
|
+
HWY_DASSERT(num_zones_ < kMaxZones);
|
|
325
|
+
Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
|
|
326
|
+
zone->Set(biased_offset, num_calls, duration);
|
|
327
|
+
HWY_DASSERT(zone->BiasedOffset() == biased_offset);
|
|
328
|
+
++num_zones_;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
// Each instantiation of a function template seems to get its own copy of
|
|
332
|
+
// __func__ and GCC doesn't merge them. An N^2 search for duplicates is
|
|
333
|
+
// acceptable because we only expect a few dozen zones.
|
|
334
|
+
void MergeDuplicates() {
|
|
335
|
+
const char* string_origin = StringOrigin();
|
|
336
|
+
for (size_t i = 0; i < num_zones_; ++i) {
|
|
337
|
+
const size_t biased_offset = zones_[i].BiasedOffset();
|
|
338
|
+
const char* name = string_origin + biased_offset;
|
|
339
|
+
// Separate num_calls from biased_offset so we can add them together.
|
|
340
|
+
uint64_t num_calls = zones_[i].NumCalls();
|
|
341
|
+
|
|
342
|
+
// Add any subsequent duplicates to num_calls and total_duration.
|
|
343
|
+
for (size_t j = i + 1; j < num_zones_;) {
|
|
344
|
+
if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
|
|
345
|
+
num_calls += zones_[j].NumCalls();
|
|
346
|
+
zones_[i].Add(0, zones_[j].Duration());
|
|
347
|
+
// Fill hole with last item.
|
|
348
|
+
zones_[j] = zones_[--num_zones_];
|
|
349
|
+
} else { // Name differed, try next Accumulator.
|
|
350
|
+
++j;
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));
|
|
355
|
+
|
|
356
|
+
// Re-pack regardless of whether any duplicates were found.
|
|
357
|
+
zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
uint64_t analyze_elapsed_ = 0;
|
|
362
|
+
uint64_t self_overhead_ = 0;
|
|
363
|
+
uint64_t child_overhead_ = 0;
|
|
364
|
+
|
|
365
|
+
size_t depth_ = 0; // Number of active zones.
|
|
366
|
+
size_t num_zones_ = 0; // Number of retired zones.
|
|
367
|
+
|
|
368
|
+
alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth]; // Stack
|
|
369
|
+
alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones]; // Self-organizing list
|
|
370
|
+
};
|
|
371
|
+
|
|
372
|
+
// Per-thread packet storage, dynamically allocated.
|
|
373
|
+
class ThreadSpecific {
|
|
374
|
+
static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);
|
|
375
|
+
|
|
376
|
+
public:
|
|
377
|
+
// "name" is used to sanity-check offsets fit in kOffsetBits.
|
|
378
|
+
explicit ThreadSpecific(const char* name)
|
|
379
|
+
: max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
|
|
380
|
+
packets_(AllocateAligned<Packet>(max_packets_)),
|
|
381
|
+
num_packets_(0),
|
|
382
|
+
string_origin_(StringOrigin()) {
|
|
383
|
+
// Even in optimized builds, verify that this zone's name offset fits
|
|
384
|
+
// within the allotted space. If not, UpdateOrAdd is likely to overrun
|
|
385
|
+
// zones_[]. Checking here on the cold path (only reached once per thread)
|
|
386
|
+
// is cheap, but it only covers one zone.
|
|
387
|
+
const size_t biased_offset = name - string_origin_;
|
|
388
|
+
HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
// Depends on Zone => defined below.
|
|
392
|
+
void ComputeOverhead();
|
|
393
|
+
|
|
394
|
+
void WriteEntry(const char* name, const uint64_t timestamp) {
|
|
395
|
+
const size_t biased_offset = name - string_origin_;
|
|
396
|
+
Write(Packet::Make(biased_offset, timestamp));
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
void WriteExit(const uint64_t timestamp) {
|
|
400
|
+
const size_t biased_offset = Packet::kOffsetBias;
|
|
401
|
+
Write(Packet::Make(biased_offset, timestamp));
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
void AnalyzeRemainingPackets() {
|
|
405
|
+
// Ensures prior weakly-ordered streaming stores are globally visible.
|
|
406
|
+
FlushStream();
|
|
407
|
+
|
|
408
|
+
// Storage full => empty it.
|
|
409
|
+
if (num_packets_ + buffer_size_ > max_packets_) {
|
|
410
|
+
results_.AnalyzePackets(packets_.get(), num_packets_);
|
|
411
|
+
num_packets_ = 0;
|
|
412
|
+
}
|
|
413
|
+
CopyBytes(buffer_, packets_.get() + num_packets_,
|
|
414
|
+
buffer_size_ * sizeof(Packet));
|
|
415
|
+
num_packets_ += buffer_size_;
|
|
416
|
+
|
|
417
|
+
results_.AnalyzePackets(packets_.get(), num_packets_);
|
|
418
|
+
num_packets_ = 0;
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
Results& GetResults() { return results_; }
|
|
422
|
+
|
|
423
|
+
private:
|
|
424
|
+
// Write packet to buffer/storage, emptying them as needed.
|
|
425
|
+
void Write(const Packet packet) {
|
|
426
|
+
// Buffer full => copy to storage.
|
|
427
|
+
if (buffer_size_ == kBufferCapacity) {
|
|
428
|
+
// Storage full => empty it.
|
|
429
|
+
if (num_packets_ + kBufferCapacity > max_packets_) {
|
|
430
|
+
results_.AnalyzePackets(packets_.get(), num_packets_);
|
|
431
|
+
num_packets_ = 0;
|
|
432
|
+
}
|
|
433
|
+
// This buffering halves observer overhead and decreases the overall
|
|
434
|
+
// runtime by about 3%. Casting is safe because the first member is u64.
|
|
435
|
+
StreamCacheLine(
|
|
436
|
+
reinterpret_cast<const uint64_t*>(buffer_),
|
|
437
|
+
reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
|
|
438
|
+
num_packets_ += kBufferCapacity;
|
|
439
|
+
buffer_size_ = 0;
|
|
440
|
+
}
|
|
441
|
+
buffer_[buffer_size_] = packet;
|
|
442
|
+
++buffer_size_;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Write-combining buffer to avoid cache pollution. Must be the first
|
|
446
|
+
// non-static member to ensure cache-line alignment.
|
|
447
|
+
Packet buffer_[kBufferCapacity];
|
|
448
|
+
size_t buffer_size_ = 0;
|
|
449
|
+
|
|
450
|
+
const size_t max_packets_;
|
|
451
|
+
// Contiguous storage for zone enter/exit packets.
|
|
452
|
+
AlignedFreeUniquePtr<Packet[]> packets_;
|
|
453
|
+
size_t num_packets_;
|
|
454
|
+
// Cached here because we already read this cache line on zone entry/exit.
|
|
455
|
+
const char* HWY_RESTRICT string_origin_;
|
|
456
|
+
Results results_;
|
|
457
|
+
};
|
|
458
|
+
|
|
459
|
+
class ThreadList {
|
|
460
|
+
public:
|
|
461
|
+
// Called from any thread.
|
|
462
|
+
ThreadSpecific* Add(const char* name) {
|
|
463
|
+
const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
|
|
464
|
+
HWY_DASSERT(index < kMaxThreads);
|
|
465
|
+
|
|
466
|
+
ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
|
|
467
|
+
threads_[index].store(ts, std::memory_order_release);
|
|
468
|
+
return ts;
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
// Single-threaded.
|
|
472
|
+
void PrintResults() {
|
|
473
|
+
const auto acq = std::memory_order_acquire;
|
|
474
|
+
const size_t num_threads = num_threads_.load(acq);
|
|
475
|
+
|
|
476
|
+
ThreadSpecific* main = threads_[0].load(acq);
|
|
477
|
+
main->AnalyzeRemainingPackets();
|
|
478
|
+
|
|
479
|
+
for (size_t i = 1; i < num_threads; ++i) {
|
|
480
|
+
ThreadSpecific* ts = threads_[i].load(acq);
|
|
481
|
+
ts->AnalyzeRemainingPackets();
|
|
482
|
+
main->GetResults().Assimilate(ts->GetResults());
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
if (num_threads != 0) {
|
|
486
|
+
main->GetResults().Print();
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
private:
|
|
491
|
+
// Owning pointers.
|
|
492
|
+
alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
|
|
493
|
+
std::atomic<size_t> num_threads_{0};
|
|
494
|
+
};
|
|
495
|
+
|
|
496
|
+
// RAII zone enter/exit recorder constructed by the ZONE macro; also
|
|
497
|
+
// responsible for initializing ThreadSpecific.
|
|
498
|
+
class Zone {
|
|
499
|
+
public:
|
|
500
|
+
// "name" must be a string literal (see StringOrigin).
|
|
501
|
+
HWY_NOINLINE explicit Zone(const char* name) {
|
|
502
|
+
HWY_FENCE;
|
|
503
|
+
ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
|
|
504
|
+
if (HWY_UNLIKELY(thread_specific == nullptr)) {
|
|
505
|
+
// Ensure the CPU supports our timer.
|
|
506
|
+
char cpu[100];
|
|
507
|
+
if (!platform::HaveTimerStop(cpu)) {
|
|
508
|
+
HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
thread_specific = StaticThreadSpecific() = Threads().Add(name);
|
|
512
|
+
// Must happen after setting StaticThreadSpecific, because ComputeOverhead
|
|
513
|
+
// also calls Zone().
|
|
514
|
+
thread_specific->ComputeOverhead();
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// (Capture timestamp ASAP, not inside WriteEntry.)
|
|
518
|
+
HWY_FENCE;
|
|
519
|
+
const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
|
|
520
|
+
thread_specific->WriteEntry(name, timestamp);
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
HWY_NOINLINE ~Zone() {
|
|
524
|
+
HWY_FENCE;
|
|
525
|
+
const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
|
|
526
|
+
StaticThreadSpecific()->WriteExit(timestamp);
|
|
527
|
+
HWY_FENCE;
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
// Call exactly once after all threads have exited all zones.
|
|
531
|
+
static void PrintResults() { Threads().PrintResults(); }
|
|
532
|
+
|
|
533
|
+
private:
|
|
534
|
+
// Returns reference to the thread's ThreadSpecific pointer (initially null).
|
|
535
|
+
// Function-local static avoids needing a separate definition.
|
|
536
|
+
static ThreadSpecific*& StaticThreadSpecific() {
|
|
537
|
+
static thread_local ThreadSpecific* thread_specific;
|
|
538
|
+
return thread_specific;
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
// Returns the singleton ThreadList. Non time-critical.
|
|
542
|
+
static ThreadList& Threads() {
|
|
543
|
+
static ThreadList threads_;
|
|
544
|
+
return threads_;
|
|
545
|
+
}
|
|
546
|
+
};
|
|
547
|
+
|
|
548
|
+
// Opens a profiler zone that lasts from this statement to the end of the
// enclosing scope; a timestamp is recorded on entry and again on exit.
// "name" has to be a string literal: it is concatenated with "" so that
// anything else fails to compile.
#define PROFILER_ZONE(name)      \
  HWY_FENCE;                     \
  const hwy::Zone zone("" name); \
  HWY_FENCE
|
|
555
|
+
|
|
556
|
+
// Opens a zone named after the enclosing function (__func__). Put it at the
// top of a function to cover the whole body; it saves spelling out a name as
// PROFILER_ZONE requires.
#define PROFILER_FUNC             \
  HWY_FENCE;                      \
  const hwy::Zone zone(__func__); \
  HWY_FENCE
|
|
562
|
+
|
|
563
|
+
// Expands to the name of the printing function, so it is invoked with
// parentheses: PROFILER_PRINT_RESULTS();
#define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults
|
|
564
|
+
|
|
565
|
+
inline void ThreadSpecific::ComputeOverhead() {
|
|
566
|
+
namespace hn = HWY_NAMESPACE;
|
|
567
|
+
// Delay after capturing timestamps before/after the actual zone runs. Even
|
|
568
|
+
// with frequency throttling disabled, this has a multimodal distribution,
|
|
569
|
+
// including 32, 34, 48, 52, 59, 62.
|
|
570
|
+
uint64_t self_overhead;
|
|
571
|
+
{
|
|
572
|
+
const size_t kNumSamples = 32;
|
|
573
|
+
uint32_t samples[kNumSamples];
|
|
574
|
+
for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
|
|
575
|
+
const size_t kNumDurations = 1024;
|
|
576
|
+
uint32_t durations[kNumDurations];
|
|
577
|
+
|
|
578
|
+
for (size_t idx_duration = 0; idx_duration < kNumDurations;
|
|
579
|
+
++idx_duration) {
|
|
580
|
+
{
|
|
581
|
+
PROFILER_ZONE("Dummy Zone (never shown)");
|
|
582
|
+
}
|
|
583
|
+
const uint64_t duration = results_.ZoneDuration(buffer_);
|
|
584
|
+
buffer_size_ = 0;
|
|
585
|
+
durations[idx_duration] = static_cast<uint32_t>(duration);
|
|
586
|
+
HWY_DASSERT(num_packets_ == 0);
|
|
587
|
+
}
|
|
588
|
+
robust_statistics::CountingSort(durations, kNumDurations);
|
|
589
|
+
samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
|
|
590
|
+
}
|
|
591
|
+
// Median.
|
|
592
|
+
robust_statistics::CountingSort(samples, kNumSamples);
|
|
593
|
+
self_overhead = samples[kNumSamples / 2];
|
|
594
|
+
if (PROFILER_PRINT_OVERHEAD) {
|
|
595
|
+
printf("Overhead: %zu\n", self_overhead);
|
|
596
|
+
}
|
|
597
|
+
results_.SetSelfOverhead(self_overhead);
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
// Delay before capturing start timestamp / after end timestamp.
|
|
601
|
+
const size_t kNumSamples = 32;
|
|
602
|
+
uint32_t samples[kNumSamples];
|
|
603
|
+
for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
|
|
604
|
+
const size_t kNumDurations = 16;
|
|
605
|
+
uint32_t durations[kNumDurations];
|
|
606
|
+
for (size_t idx_duration = 0; idx_duration < kNumDurations;
|
|
607
|
+
++idx_duration) {
|
|
608
|
+
const size_t kReps = 10000;
|
|
609
|
+
// Analysis time should not be included => must fit within buffer.
|
|
610
|
+
HWY_DASSERT(kReps * 2 < max_packets_);
|
|
611
|
+
std::atomic_thread_fence(std::memory_order_seq_cst);
|
|
612
|
+
const uint64_t t0 = hn::timer::Start();
|
|
613
|
+
for (size_t i = 0; i < kReps; ++i) {
|
|
614
|
+
PROFILER_ZONE("Dummy");
|
|
615
|
+
}
|
|
616
|
+
FlushStream();
|
|
617
|
+
const uint64_t t1 = hn::timer::Stop();
|
|
618
|
+
HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
|
|
619
|
+
buffer_size_ = 0;
|
|
620
|
+
num_packets_ = 0;
|
|
621
|
+
const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
|
|
622
|
+
durations[idx_duration] =
|
|
623
|
+
static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
|
|
624
|
+
}
|
|
625
|
+
robust_statistics::CountingSort(durations, kNumDurations);
|
|
626
|
+
samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
|
|
627
|
+
}
|
|
628
|
+
robust_statistics::CountingSort(samples, kNumSamples);
|
|
629
|
+
const uint64_t child_overhead = samples[9 * kNumSamples / 10];
|
|
630
|
+
if (PROFILER_PRINT_OVERHEAD) {
|
|
631
|
+
printf("Child overhead: %zu\n", child_overhead);
|
|
632
|
+
}
|
|
633
|
+
results_.SetChildOverhead(child_overhead);
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
#pragma pack(pop)
|
|
637
|
+
|
|
638
|
+
} // namespace hwy
|
|
639
|
+
|
|
640
|
+
#endif // PROFILER_ENABLED || HWY_IDE
|
|
641
|
+
|
|
642
|
+
// Profiler compiled out: the three macros expand to nothing, so call sites
// do not need their own #if.
#if !PROFILER_ENABLED && !HWY_IDE
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif
|
|
647
|
+
|
|
648
|
+
#endif // HIGHWAY_HWY_PROFILER_H_
|
|
@@ -135,8 +135,8 @@ T MedianAbsoluteDeviation(const T* values, const size_t num_values,
|
|
|
135
135
|
std::vector<T> abs_deviations;
|
|
136
136
|
abs_deviations.reserve(num_values);
|
|
137
137
|
for (size_t i = 0; i < num_values; ++i) {
|
|
138
|
-
const int64_t abs =
|
|
139
|
-
|
|
138
|
+
const int64_t abs = ScalarAbs(static_cast<int64_t>(values[i]) -
|
|
139
|
+
static_cast<int64_t>(median));
|
|
140
140
|
abs_deviations.push_back(static_cast<T>(abs));
|
|
141
141
|
}
|
|
142
142
|
return Median(abs_deviations.data(), num_values);
|