halide 19.0.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- halide/__init__.py +39 -0
- halide/_generator_helpers.py +835 -0
- halide/bin/Halide.dll +0 -0
- halide/bin/adams2019_retrain_cost_model.exe +0 -0
- halide/bin/adams2019_weightsdir_to_weightsfile.exe +0 -0
- halide/bin/anderson2021_retrain_cost_model.exe +0 -0
- halide/bin/anderson2021_weightsdir_to_weightsfile.exe +0 -0
- halide/bin/featurization_to_sample.exe +0 -0
- halide/bin/gengen.exe +0 -0
- halide/bin/get_host_target.exe +0 -0
- halide/halide_.cp313-win_amd64.pyd +0 -0
- halide/imageio.py +60 -0
- halide/include/Halide.h +35293 -0
- halide/include/HalideBuffer.h +2618 -0
- halide/include/HalidePyTorchCudaHelpers.h +64 -0
- halide/include/HalidePyTorchHelpers.h +120 -0
- halide/include/HalideRuntime.h +2221 -0
- halide/include/HalideRuntimeCuda.h +89 -0
- halide/include/HalideRuntimeD3D12Compute.h +91 -0
- halide/include/HalideRuntimeHexagonDma.h +104 -0
- halide/include/HalideRuntimeHexagonHost.h +157 -0
- halide/include/HalideRuntimeMetal.h +112 -0
- halide/include/HalideRuntimeOpenCL.h +119 -0
- halide/include/HalideRuntimeQurt.h +32 -0
- halide/include/HalideRuntimeVulkan.h +137 -0
- halide/include/HalideRuntimeWebGPU.h +44 -0
- halide/lib/Halide.lib +0 -0
- halide/lib/HalidePyStubs.lib +0 -0
- halide/lib/Halide_GenGen.lib +0 -0
- halide/lib/autoschedule_adams2019.dll +0 -0
- halide/lib/autoschedule_anderson2021.dll +0 -0
- halide/lib/autoschedule_li2018.dll +0 -0
- halide/lib/autoschedule_mullapudi2016.dll +0 -0
- halide/lib/cmake/Halide/FindHalide_LLVM.cmake +152 -0
- halide/lib/cmake/Halide/FindV8.cmake +33 -0
- halide/lib/cmake/Halide/Halide-shared-deps.cmake +0 -0
- halide/lib/cmake/Halide/Halide-shared-targets-release.cmake +29 -0
- halide/lib/cmake/Halide/Halide-shared-targets.cmake +154 -0
- halide/lib/cmake/Halide/HalideConfig.cmake +162 -0
- halide/lib/cmake/Halide/HalideConfigVersion.cmake +65 -0
- halide/lib/cmake/HalideHelpers/FindHalide_WebGPU.cmake +27 -0
- halide/lib/cmake/HalideHelpers/Halide-Interfaces-release.cmake +112 -0
- halide/lib/cmake/HalideHelpers/Halide-Interfaces.cmake +236 -0
- halide/lib/cmake/HalideHelpers/HalideGeneratorHelpers.cmake +1056 -0
- halide/lib/cmake/HalideHelpers/HalideHelpersConfig.cmake +28 -0
- halide/lib/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
- halide/lib/cmake/HalideHelpers/HalideTargetHelpers.cmake +99 -0
- halide/lib/cmake/HalideHelpers/MutexCopy.ps1 +31 -0
- halide/lib/cmake/HalideHelpers/TargetExportScript.cmake +55 -0
- halide/lib/cmake/Halide_Python/Halide_Python-targets-release.cmake +29 -0
- halide/lib/cmake/Halide_Python/Halide_Python-targets.cmake +125 -0
- halide/lib/cmake/Halide_Python/Halide_PythonConfig.cmake +26 -0
- halide/lib/cmake/Halide_Python/Halide_PythonConfigVersion.cmake +65 -0
- halide/share/doc/Halide/LICENSE.txt +233 -0
- halide/share/doc/Halide/README.md +439 -0
- halide/share/doc/Halide/doc/BuildingHalideWithCMake.md +626 -0
- halide/share/doc/Halide/doc/CodeStyleCMake.md +393 -0
- halide/share/doc/Halide/doc/FuzzTesting.md +104 -0
- halide/share/doc/Halide/doc/HalideCMakePackage.md +812 -0
- halide/share/doc/Halide/doc/Hexagon.md +73 -0
- halide/share/doc/Halide/doc/Python.md +844 -0
- halide/share/doc/Halide/doc/RunGen.md +283 -0
- halide/share/doc/Halide/doc/Testing.md +125 -0
- halide/share/doc/Halide/doc/Vulkan.md +287 -0
- halide/share/doc/Halide/doc/WebAssembly.md +228 -0
- halide/share/doc/Halide/doc/WebGPU.md +128 -0
- halide/share/tools/RunGen.h +1470 -0
- halide/share/tools/RunGenMain.cpp +642 -0
- halide/share/tools/adams2019_autotune_loop.sh +227 -0
- halide/share/tools/anderson2021_autotune_loop.sh +591 -0
- halide/share/tools/halide_benchmark.h +240 -0
- halide/share/tools/halide_image.h +31 -0
- halide/share/tools/halide_image_info.h +318 -0
- halide/share/tools/halide_image_io.h +2794 -0
- halide/share/tools/halide_malloc_trace.h +102 -0
- halide/share/tools/halide_thread_pool.h +161 -0
- halide/share/tools/halide_trace_config.h +559 -0
- halide-19.0.0.data/data/share/cmake/Halide/HalideConfig.cmake +6 -0
- halide-19.0.0.data/data/share/cmake/Halide/HalideConfigVersion.cmake +65 -0
- halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfig.cmake +6 -0
- halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
- halide-19.0.0.dist-info/METADATA +301 -0
- halide-19.0.0.dist-info/RECORD +85 -0
- halide-19.0.0.dist-info/WHEEL +5 -0
- halide-19.0.0.dist-info/licenses/LICENSE.txt +233 -0
halide/share/tools/adams2019_autotune_loop.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+
+# Build the generator to autotune. This script will be autotuning the
+# autoscheduler's cost model training pipeline, which is large enough
+# to be interesting.
+if [ $# -lt 6 -o $# -gt 8 ]; then
+    echo "Usage: $0 /path/to/some.generator generatorname halide_target weights_file autoschedule_bin_dir halide_distrib_path samples_out_path [generator_args_sets]"
+    exit
+fi
+
+set -eu
+
+#trap "exit" INT TERM
+#trap "kill 0" EXIT
+
+GENERATOR=${1}
+PIPELINE=${2}
+HL_TARGET=${3}
+START_WEIGHTS_FILE=${4}
+AUTOSCHED_BIN=${5}
+HALIDE_DISTRIB_PATH=${6}
+SAMPLES=${7}
+
+# Read the generator-arg sets into an array. Each set is delimited
+# by space; multiple values within each set are are delimited with ;
+# e.g. "set1arg1=1;set1arg2=foo set2=bar set3arg1=3.14;set4arg2=42"
+if [ $# -ge 8 ]; then
+    IFS=' ' read -r -a GENERATOR_ARGS_SETS_ARRAY <<< "${8}"
+else
+    declare -a GENERATOR_ARGS_SETS_ARRAY=
+fi
+
+# Ensure the length is at least 1
+if [ ${#GENERATOR_ARGS_SETS_ARRAY[@]} -eq 0 ]; then
+    GENERATOR_ARGS_SETS_ARRAY=( '' )
+fi
+
+COMPILATION_TIMEOUT=600s
+BENCHMARKING_TIMEOUT=60s
+
+if [ -z ${HL_TARGET} ]; then
+    # Use the host target -- but remove features that we don't want to train
+    # for by default, at least not yet (most notably, AVX512).
+    HL_TARGET=`${AUTOSCHED_BIN}/get_host_target avx512 avx512_knl avx512_skylake avx512_cannonlake`
+fi
+echo Training target is: ${HL_TARGET}
+
+if [ -z ${GENERATOR} ]; then
+    GENERATOR=./bin/adams2019_demo.generator
+fi
+
+if [ -z ${PIPELINE} ]; then
+    PIPELINE=demo
+fi
+
+mkdir -p ${SAMPLES}
+
+WEIGHTS=${SAMPLES}/updated.weights
+if [[ -f ${WEIGHTS} ]]; then
+    echo Using existing weights "${WEIGHTS}"
+else
+    # Only copy over the weights if we don't have any already,
+    # so that restarted jobs can continue from where they left off
+    cp ${START_WEIGHTS_FILE} ${WEIGHTS}
+    echo Copying starting weights from ${START_WEIGHTS_FILE} to ${WEIGHTS}
+fi
+
+# A batch of this many samples is built in parallel, and then
+# benchmarked serially.
+BATCH_SIZE=32
+
+TIMEOUT_CMD="timeout"
+if [ $(uname -s) = "Darwin" ] && ! which $TIMEOUT_CMD 2>&1 >/dev/null; then
+    # OSX doesn't have timeout; gtimeout is equivalent and available via Homebrew
+    TIMEOUT_CMD="gtimeout"
+    if ! which $TIMEOUT_CMD 2>&1 >/dev/null; then
+        echo "Can't find the command 'gtimeout'. Run 'brew install coreutils' to install it."
+        exit 1
+    fi
+fi
+
+PLUGIN_EXT=so
+
+# Build a single featurization of the pipeline with a random schedule
+make_featurization() {
+    D=${1}
+    SEED=${2}
+    FNAME=${3}
+    EXTRA_GENERATOR_ARGS=${4}
+    mkdir -p ${D}
+    rm -f "${D}/${FNAME}.featurization"
+    rm -f "${D}/${FNAME}.sample"
+    if [[ $D == */0 ]]; then
+        # Sample 0 in each batch is best effort beam search, with no randomness
+        dropout=100
+        beam=32
+    else
+        # The other samples are random probes biased by the cost model
+        dropout=1 # 1% chance of operating entirely greedily
+        beam=1
+    fi
+    ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \
+        ${GENERATOR} \
+        -g ${PIPELINE} \
+        -f ${FNAME} \
+        -o ${D} \
+        -e stmt,assembly,static_library,c_header,registration,schedule,featurization \
+        target=${HL_TARGET} \
+        ${EXTRA_GENERATOR_ARGS} \
+        -p ${AUTOSCHED_BIN}/libautoschedule_adams2019.${PLUGIN_EXT} \
+        autoscheduler=Adams2019 \
+        autoscheduler.parallelism=32 \
+        autoscheduler.beam_size=${beam} \
+        autoscheduler.random_dropout=${dropout} \
+        autoscheduler.random_dropout_seed=${SEED} \
+        autoscheduler.weights_path=${WEIGHTS} \
+        2> ${D}/compile_log.txt || echo "Compilation failed or timed out for ${D}"
+
+
+    # We don't need image I/O for this purpose,
+    # so leave out libpng and libjpeg
+    c++ \
+        -std=c++17 \
+        -I ${HALIDE_DISTRIB_PATH}/include \
+        ${HALIDE_DISTRIB_PATH}/tools/RunGenMain.cpp \
+        ${D}/*.registration.cpp \
+        ${D}/*.a \
+        -o ${D}/bench \
+        -DHALIDE_NO_PNG -DHALIDE_NO_JPEG \
+        -ldl -lpthread
+}
+
+# Benchmark one of the random samples
+benchmark_sample() {
+    sleep 1 # Give CPU clocks a chance to spin back up if we're thermally throttling
+    D=${1}
+    HL_NUM_THREADS=32 \
+        ${TIMEOUT_CMD} -k ${BENCHMARKING_TIMEOUT} ${BENCHMARKING_TIMEOUT} \
+        ${D}/bench \
+        --estimate_all \
+        --benchmarks=all \
+        | tee ${D}/bench.txt || echo "Benchmarking failed or timed out for ${D}"
+
+    # Add the runtime, pipeline id, and schedule id to the feature file
+    R=$(cut -d' ' -f8 < ${D}/bench.txt)
+    P=$3
+    S=$2
+    FNAME=$4
+    ${AUTOSCHED_BIN}/featurization_to_sample ${D}/${FNAME}.featurization $R $P $S ${D}/${FNAME}.sample || echo "featurization_to_sample failed for ${D} (probably because benchmarking failed)"
+}
+
+# Don't clobber existing samples
+FIRST=$(ls -d ${SAMPLES}/batch_* 2>/dev/null | sed -e "s|.*/batch_||;s|_.*||" | sort -n | tail -n1)
+
+if [ $(uname -s) = "Darwin" ]; then
+    LOCAL_CORES=`sysctl -n hw.ncpu`
+else
+    LOCAL_CORES=`nproc`
+fi
+echo Local number of cores detected as ${LOCAL_CORES}
+
+NUM_BATCHES=1
+
+for ((BATCH_ID=$((FIRST+1));BATCH_ID<$((FIRST+1+NUM_BATCHES));BATCH_ID++)); do
+
+    SECONDS=0
+
+    for ((EXTRA_ARGS_IDX=0;EXTRA_ARGS_IDX<${#GENERATOR_ARGS_SETS_ARRAY[@]};EXTRA_ARGS_IDX++)); do
+
+        # Compile a batch of samples using the generator in parallel
+        DIR=${SAMPLES}/batch_${BATCH_ID}_${EXTRA_ARGS_IDX}
+
+        # Copy the weights being used into the batch folder so that we can repro failures
+        mkdir -p ${DIR}/
+        cp ${WEIGHTS} ${DIR}/used.weights
+
+        EXTRA_GENERATOR_ARGS=${GENERATOR_ARGS_SETS_ARRAY[EXTRA_ARGS_IDX]/;/ }
+        if [ ! -z "${EXTRA_GENERATOR_ARGS}" ]; then
+            echo "Adding extra generator args (${EXTRA_GENERATOR_ARGS}) for batch_${BATCH_ID}"
+        fi
+
+        echo ${EXTRA_GENERATOR_ARGS} > ${DIR}/extra_generator_args.txt
+
+        # Do parallel compilation in batches, so that machines with fewer than BATCH_SIZE cores
+        # don't get swamped and timeout unnecessarily
+        echo -n Compiling ${BATCH_SIZE} samples
+        for ((SAMPLE_ID=0;SAMPLE_ID<${BATCH_SIZE};SAMPLE_ID++)); do
+            while [[ 1 ]]; do
+                RUNNING=$(jobs -r | wc -l)
+                if [[ RUNNING -ge LOCAL_CORES ]]; then
+                    sleep 1
+                else
+                    break
+                fi
+            done
+
+            S=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID)
+            FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID)
+            make_featurization "${DIR}/${SAMPLE_ID}" $S $FNAME "$EXTRA_GENERATOR_ARGS" &
+            echo -n .
+        done
+        wait
+        echo done.
+
+        # benchmark them serially using rungen
+        for ((SAMPLE_ID=0;SAMPLE_ID<${BATCH_SIZE};SAMPLE_ID++)); do
+            S=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID)
+            FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID)
+            benchmark_sample "${DIR}/${SAMPLE_ID}" $S $EXTRA_ARGS_IDX $FNAME
+        done
+
+        # retrain model weights on all samples seen so far
+        echo Retraining model...
+
+        find ${SAMPLES} -name "*.sample" | \
+            ${AUTOSCHED_BIN}/adams2019_retrain_cost_model \
+            --epochs=${BATCH_SIZE} \
+            --rates="0.0001" \
+            --num_cores=32 \
+            --initial_weights=${WEIGHTS} \
+            --weights_out=${WEIGHTS} \
+            --best_benchmark=${SAMPLES}/best.${PIPELINE}.benchmark.txt \
+            --best_schedule=${SAMPLES}/best.${PIPELINE}.schedule.h
+    done
+
+    echo Batch ${BATCH_ID} took ${SECONDS} seconds to compile, benchmark, and retrain
+done
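For context, the Usage string in the script above implies an invocation along the following lines. This is a minimal sketch with placeholder paths: the generator binary, starting weights file, and samples directory are assumptions, not files shipped in the wheel. Note also that this wheel ships `RunGenMain.cpp` under `halide/share/tools/` and the autoscheduler plugins as `autoschedule_*.dll` under `halide/lib/`, so the script's `${HALIDE_DISTRIB_PATH}/tools` layout and `PLUGIN_EXT=so` default would need adjusting to run against the Windows package as installed.

```bash
# Hypothetical invocation; all paths are placeholders. Arguments follow the
# Usage string in the script:
#   generator binary, generator name, Halide target, starting weights file,
#   autoscheduler bin dir, Halide distribution path, samples output dir.
bash adams2019_autotune_loop.sh \
    ./bin/adams2019_demo.generator \
    demo \
    x86-64-linux \
    ./baseline.weights \
    ./halide/bin \
    ./halide \
    ./samples
```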