halide 19.0.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. halide/__init__.py +39 -0
  2. halide/_generator_helpers.py +835 -0
  3. halide/bin/Halide.dll +0 -0
  4. halide/bin/adams2019_retrain_cost_model.exe +0 -0
  5. halide/bin/adams2019_weightsdir_to_weightsfile.exe +0 -0
  6. halide/bin/anderson2021_retrain_cost_model.exe +0 -0
  7. halide/bin/anderson2021_weightsdir_to_weightsfile.exe +0 -0
  8. halide/bin/featurization_to_sample.exe +0 -0
  9. halide/bin/gengen.exe +0 -0
  10. halide/bin/get_host_target.exe +0 -0
  11. halide/halide_.cp313-win_amd64.pyd +0 -0
  12. halide/imageio.py +60 -0
  13. halide/include/Halide.h +35293 -0
  14. halide/include/HalideBuffer.h +2618 -0
  15. halide/include/HalidePyTorchCudaHelpers.h +64 -0
  16. halide/include/HalidePyTorchHelpers.h +120 -0
  17. halide/include/HalideRuntime.h +2221 -0
  18. halide/include/HalideRuntimeCuda.h +89 -0
  19. halide/include/HalideRuntimeD3D12Compute.h +91 -0
  20. halide/include/HalideRuntimeHexagonDma.h +104 -0
  21. halide/include/HalideRuntimeHexagonHost.h +157 -0
  22. halide/include/HalideRuntimeMetal.h +112 -0
  23. halide/include/HalideRuntimeOpenCL.h +119 -0
  24. halide/include/HalideRuntimeQurt.h +32 -0
  25. halide/include/HalideRuntimeVulkan.h +137 -0
  26. halide/include/HalideRuntimeWebGPU.h +44 -0
  27. halide/lib/Halide.lib +0 -0
  28. halide/lib/HalidePyStubs.lib +0 -0
  29. halide/lib/Halide_GenGen.lib +0 -0
  30. halide/lib/autoschedule_adams2019.dll +0 -0
  31. halide/lib/autoschedule_anderson2021.dll +0 -0
  32. halide/lib/autoschedule_li2018.dll +0 -0
  33. halide/lib/autoschedule_mullapudi2016.dll +0 -0
  34. halide/lib/cmake/Halide/FindHalide_LLVM.cmake +152 -0
  35. halide/lib/cmake/Halide/FindV8.cmake +33 -0
  36. halide/lib/cmake/Halide/Halide-shared-deps.cmake +0 -0
  37. halide/lib/cmake/Halide/Halide-shared-targets-release.cmake +29 -0
  38. halide/lib/cmake/Halide/Halide-shared-targets.cmake +154 -0
  39. halide/lib/cmake/Halide/HalideConfig.cmake +162 -0
  40. halide/lib/cmake/Halide/HalideConfigVersion.cmake +65 -0
  41. halide/lib/cmake/HalideHelpers/FindHalide_WebGPU.cmake +27 -0
  42. halide/lib/cmake/HalideHelpers/Halide-Interfaces-release.cmake +112 -0
  43. halide/lib/cmake/HalideHelpers/Halide-Interfaces.cmake +236 -0
  44. halide/lib/cmake/HalideHelpers/HalideGeneratorHelpers.cmake +1056 -0
  45. halide/lib/cmake/HalideHelpers/HalideHelpersConfig.cmake +28 -0
  46. halide/lib/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
  47. halide/lib/cmake/HalideHelpers/HalideTargetHelpers.cmake +99 -0
  48. halide/lib/cmake/HalideHelpers/MutexCopy.ps1 +31 -0
  49. halide/lib/cmake/HalideHelpers/TargetExportScript.cmake +55 -0
  50. halide/lib/cmake/Halide_Python/Halide_Python-targets-release.cmake +29 -0
  51. halide/lib/cmake/Halide_Python/Halide_Python-targets.cmake +125 -0
  52. halide/lib/cmake/Halide_Python/Halide_PythonConfig.cmake +26 -0
  53. halide/lib/cmake/Halide_Python/Halide_PythonConfigVersion.cmake +65 -0
  54. halide/share/doc/Halide/LICENSE.txt +233 -0
  55. halide/share/doc/Halide/README.md +439 -0
  56. halide/share/doc/Halide/doc/BuildingHalideWithCMake.md +626 -0
  57. halide/share/doc/Halide/doc/CodeStyleCMake.md +393 -0
  58. halide/share/doc/Halide/doc/FuzzTesting.md +104 -0
  59. halide/share/doc/Halide/doc/HalideCMakePackage.md +812 -0
  60. halide/share/doc/Halide/doc/Hexagon.md +73 -0
  61. halide/share/doc/Halide/doc/Python.md +844 -0
  62. halide/share/doc/Halide/doc/RunGen.md +283 -0
  63. halide/share/doc/Halide/doc/Testing.md +125 -0
  64. halide/share/doc/Halide/doc/Vulkan.md +287 -0
  65. halide/share/doc/Halide/doc/WebAssembly.md +228 -0
  66. halide/share/doc/Halide/doc/WebGPU.md +128 -0
  67. halide/share/tools/RunGen.h +1470 -0
  68. halide/share/tools/RunGenMain.cpp +642 -0
  69. halide/share/tools/adams2019_autotune_loop.sh +227 -0
  70. halide/share/tools/anderson2021_autotune_loop.sh +591 -0
  71. halide/share/tools/halide_benchmark.h +240 -0
  72. halide/share/tools/halide_image.h +31 -0
  73. halide/share/tools/halide_image_info.h +318 -0
  74. halide/share/tools/halide_image_io.h +2794 -0
  75. halide/share/tools/halide_malloc_trace.h +102 -0
  76. halide/share/tools/halide_thread_pool.h +161 -0
  77. halide/share/tools/halide_trace_config.h +559 -0
  78. halide-19.0.0.data/data/share/cmake/Halide/HalideConfig.cmake +6 -0
  79. halide-19.0.0.data/data/share/cmake/Halide/HalideConfigVersion.cmake +65 -0
  80. halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfig.cmake +6 -0
  81. halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
  82. halide-19.0.0.dist-info/METADATA +301 -0
  83. halide-19.0.0.dist-info/RECORD +85 -0
  84. halide-19.0.0.dist-info/WHEEL +5 -0
  85. halide-19.0.0.dist-info/licenses/LICENSE.txt +233 -0
@@ -0,0 +1,591 @@
1
#!/bin/bash

# Autotune the given generator with the Anderson2021 GPU autoscheduler.
#
# Positional arguments: see the usage message below.
# Required environment: SAMPLES_DIR (output directory), CXX (host C++ compiler).
# Optional environment knobs: LEARNING_RATE, SEARCH_SPACE_OPTIONS,
# RANDOMIZE_TILINGS, USE_FREEZE, USE_BENCHMARK_QUEUE, RETRAIN_AFTER_EACH_BATCH,
# COMPILE_ONLY, ENABLE_BEAM_SEARCH, NUM_BATCHES.
if [[ $# -lt 7 || $# -gt 8 ]]; then
    echo "Usage: $0 /path/to/some.generator generatorname halide_target weights_file halide_build_dir parallelism train_only [generator_args_sets]"
    # Fix: exit non-zero on a usage error (a bare 'exit' reported success).
    exit 1
fi

set -eu

if [ -z ${BASH_VERSION+x} ]; then
    echo "${0} should be run as a bash script"
    exit 1
fi

AUTOSCHEDULER_SRC_DIR=$(dirname "$0")
SCRIPTS_DIR="${AUTOSCHEDULER_SRC_DIR}/scripts"
source "${SCRIPTS_DIR}/utils.sh"

GENERATOR=${1}
PIPELINE=${2}
HL_TARGET=${3}
START_WEIGHTS_FILE=${4}
HALIDE_BUILD_DIR=${5}
PARALLELISM=${6}
TRAIN_ONLY=${7}

# Helpers from utils.sh: each writes its result into the variable named by
# the last argument.
get_halide_src_dir ${AUTOSCHEDULER_SRC_DIR} HALIDE_SRC_DIR
get_autoscheduler_build_dir ${HALIDE_BUILD_DIR} AUTOSCHEDULER_BUILD_DIR
get_tools_build_dir ${HALIDE_BUILD_DIR} TOOLS_BUILD_DIR

LEARNING_RATE=${LEARNING_RATE:-0.001}

# Read the generator-arg sets into an array. Each set is delimited
# by space; multiple values within each set are delimited with ;
# e.g. "set1arg1=1;set1arg2=foo set2=bar set3arg1=3.14;set4arg2=42"
if [ $# -ge 8 ]; then
    IFS=' ' read -r -a GENERATOR_ARGS_SETS_ARRAY <<< "${8}"
else
    declare -a GENERATOR_ARGS_SETS_ARRAY=
fi

# Ensure the length is at least 1
if [ ${#GENERATOR_ARGS_SETS_ARRAY[@]} -eq 0 ]; then
    GENERATOR_ARGS_SETS_ARRAY=( '' )
fi

COMPILATION_TIMEOUT=600s
BENCHMARKING_TIMEOUT=10s

if [ -z ${CXX+x} ]; then
    echo "The CXX environment variable must be set. Exiting..."
    exit 1
fi

RUNGENMAIN="${TOOLS_BUILD_DIR}/RunGenMain.cpp.o"
if [ ! -f "${RUNGENMAIN}" ]; then
    echo "${RUNGENMAIN} not found. Exiting..."
    exit 1
fi

echo "Training target is: ${HL_TARGET}"

# Fall back to the demo generator/pipeline when empty strings were passed.
# (Quoted tests: the original unquoted '[ -z ${GENERATOR} ]' only worked by
# accident when the value was empty.)
if [ -z "${GENERATOR}" ]; then
    GENERATOR=./bin/anderson2021_demo.generator
fi

if [ -z "${PIPELINE}" ]; then
    PIPELINE=demo
fi

SEARCH_SPACE_OPTIONS=${SEARCH_SPACE_OPTIONS:-"1111"}

# Fix: emit a clear diagnostic when SAMPLES_DIR is unset instead of the bare
# 'unbound variable' error produced by 'set -u'.
SAMPLES=${SAMPLES_DIR:?SAMPLES_DIR must be set to the autotuning output directory}
mkdir -p "${SAMPLES}"

WEIGHTS=${SAMPLES}/updated.weights
if [[ -f ${WEIGHTS} ]]; then
    echo Using existing weights "${WEIGHTS}"
else
    # Only copy over the weights if we don't have any already,
    # so that restarted jobs can continue from where they left off
    cp "${START_WEIGHTS_FILE}" "${WEIGHTS}"
    echo Copying starting weights from ${START_WEIGHTS_FILE} to ${WEIGHTS}
fi

# We could add this unconditionally, but it's easier to wade thru
# results if we only add if needed
#for F in disable_llvm_loop_opt; do
#if [[ ! ${HL_TARGET} =~ .*${F}.* ]]; then
#HL_TARGET="${HL_TARGET}-${F}"
#fi
#done

get_num_cpu_cores NUM_CPU_CORES
echo "Number of CPU cores detected as ${NUM_CPU_CORES}"

# A batch of this many samples is built in parallel, and then
# benchmarked serially.
BATCH_SIZE=80
EPOCHS=200

if ! command -v nvidia-smi > /dev/null; then
    echo "nvidia-smi is required for autotuning"
    exit 1
fi

NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

RANDOMIZE_TILINGS="${RANDOMIZE_TILINGS:-1}"
USE_FREEZE="${USE_FREEZE:-1}"

echo "Randomize tilings = ${RANDOMIZE_TILINGS}"
echo "Use freeze = ${USE_FREEZE}"
echo "# GPUs = ${NUM_GPUS}"

USE_BENCHMARK_QUEUE="${USE_BENCHMARK_QUEUE:-1}"
BENCHMARK_QUEUE_DIR=${SAMPLES}/benchmark_queue

RETRAIN_AFTER_EACH_BATCH=${RETRAIN_AFTER_EACH_BATCH:-1}
COMPILE_ONLY=${COMPILE_ONLY:-0}

# Compile-only mode implies no benchmarking and no retraining.
if [[ $COMPILE_ONLY == 1 ]]; then
    echo "Compile only: ON"
    RETRAIN_AFTER_EACH_BATCH=0
    USE_BENCHMARK_QUEUE=0
else
    echo "Compile only: OFF"
fi

ENABLE_BEAM_SEARCH=${ENABLE_BEAM_SEARCH:-1}
if [[ ${ENABLE_BEAM_SEARCH} == 1 ]]; then
    echo "Beam search: ON"
else
    echo "Beam search: OFF"
fi

# Latest git hash
GIT_HASH=$(git rev-parse --verify HEAD)

if [[ $TRAIN_ONLY != 1 ]]; then
    get_timeout_cmd TIMEOUT_CMD
else
    echo "Train only mode: ON"
    EPOCHS=10000
fi
147
+
148
# Record the exact command used for a sample so failures can be reproduced.
#   $1 batch name, $2 sample id, $3 command string, $4 basename for the
#   .txt file, $5 1 if the command failed (drops the sample from training).
# Reads global: SAMPLES.
record_command() {
    BATCH=${1}
    SAMPLE_ID=${2}
    CMD=${3}
    TXT=${4}
    FAILED=${5}
    BATCH_DIR=${SAMPLES}/${BATCH}

    # Fix: printf with a quoted argument instead of unquoted 'echo $CMD'.
    # The recorded commands contain globs (e.g. '*.a', '*.registration.cpp')
    # and significant whitespace, which unquoted expansion would expand or
    # collapse, corrupting the repro log.
    printf '%s\n' "${CMD}" > "${BATCH_DIR}/${SAMPLE_ID}/${TXT}.txt"

    if [[ ${FAILED} == 1 && -f ${BATCH_DIR}/${SAMPLE_ID}/sample.sample ]]; then
        # Delete the .sample file so it doesn't get included in re-training
        rm -f "${BATCH_DIR}/${SAMPLE_ID}/sample.sample"
    fi
}
163
+
164
+ # Build a single featurization of the pipeline with a random schedule
165
# Build a single featurization of the pipeline with a random schedule.
#   $1 output dir, $2 random-dropout seed, $3 output file basename,
#   $4 extra generator args, $5 batch name, $6 sample id,
#   $7 weights snapshot path to substitute into the recorded repro command.
# Runs in the background; signals completion/failure via files in
# BENCHMARK_QUEUE_DIR when the benchmark queue is enabled.
make_featurization() {
    D=${1}
    RANDOM_DROPOUT_SEED=${2}
    FNAME=${3}
    EXTRA_GENERATOR_ARGS=${4}
    BATCH=${5}
    SAMPLE_ID=${6}
    USED_WEIGHTS=${7}

    mkdir -p "${D}"
    rm -f "${D}/${FNAME}.featurization"
    rm -f "${D}/${FNAME}.sample"

    if [[ $D == */0 && ${ENABLE_BEAM_SEARCH} == 1 ]]; then
        # Sample 0 in each batch is best effort beam search, with no randomness
        dropout=100
        beam=32
    else
        # The other samples are random probes biased by the cost model
        dropout=1  # 1% chance of operating entirely greedily
        beam=1
    fi

    # TODO: make these arguments to this file
    local -r shared_memory_limit=48
    local -r shared_memory_sm_limit=96
    local -r active_block_limit=32
    local -r active_warp_limit=64

    # NOTE(review): GPU is assigned but not used in this function — GPU
    # selection for benchmarking happens in the benchmark loop. Kept as-is in
    # case a sourced helper reads it; confirm before removing.
    GPU=$((RANDOM % NUM_GPUS))
    CMD="HL_DEBUG_AUTOSCHEDULE=1 \
        /bin/time -f 'Compile time (s): %e' ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \
        ${GENERATOR} \
        -g ${PIPELINE} \
        -f ${FNAME} \
        -o ${D} \
        -e stmt,assembly,static_library,c_header,registration,schedule,featurization \
        target=${HL_TARGET} \
        ${EXTRA_GENERATOR_ARGS} \
        -p ${AUTOSCHEDULER_BUILD_DIR}/libautoschedule_anderson2021.so \
        autoscheduler=Anderson2021 \
        autoscheduler.parallelism=${PARALLELISM} \
        autoscheduler.beam_size=${beam} \
        autoscheduler.random_dropout=${dropout} \
        autoscheduler.random_dropout_seed=${RANDOM_DROPOUT_SEED} \
        autoscheduler.weights_path=${WEIGHTS} \
        autoscheduler.randomize_tilings=${RANDOMIZE_TILINGS} \
        autoscheduler.search_space_options=${SEARCH_SPACE_OPTIONS} \
        autoscheduler.freeze_inline_compute_root=${USE_FREEZE} \
        autoscheduler.shared_memory_limit_kb=${shared_memory_limit} \
        autoscheduler.shared_memory_sm_limit_kb=${shared_memory_sm_limit} \
        autoscheduler.active_block_limit=${active_block_limit} \
        autoscheduler.active_warp_limit=${active_warp_limit} \
        2> ${D}/compile_err.txt > ${D}/compile_log.txt"

    FAILED=0
    eval $CMD || FAILED=1

    echo "git rev-parse --verify HEAD = ${GIT_HASH}" >> ${D}/compile_err.txt

    # Record the command with the per-batch weights snapshot substituted in,
    # so the failure can be reproduced after updated.weights moves on.
    record_command $BATCH $SAMPLE_ID "${CMD/$WEIGHTS/$USED_WEIGHTS}" "autoschedule_command" $FAILED
    if [[ $FAILED == 1 ]]; then
        echo "Autoschedule failed or timed out for ${D}" | tee -a ${D}/compile_err.txt
        if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then
            touch "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-failed"
        fi
        return
    fi

    # Compile the benchmarking harness for this sample.
    LIBPNG_CFLAGS=$(libpng-config --cflags)
    LIBPNG_LIBS=$(libpng-config --ldflags)
    # Fix: '-O3' was missing its trailing backslash, leaving a raw newline
    # embedded in the command string (it only worked by accident of unquoted
    # 'eval' word-splitting).
    CMD="${CXX} \
        -std=c++11 \
        -O3 \
        -I ../../include \
        ${LIBPNG_CFLAGS} \
        ${RUNGENMAIN} \
        ${D}/*.registration.cpp \
        ${D}/*.a \
        -o ${D}/bench \
        -ljpeg ${LIBPNG_LIBS} -ldl -lpthread"

    # Fix: guard the eval with '|| FAILED=1'. The original ran 'FAILED=0'
    # between 'eval' and 'if [[ $? != 0 ]]', so the test always saw the
    # assignment's status (0) — and under 'set -e' a failing unguarded eval
    # would abort this background job before the failure was recorded.
    FAILED=0
    eval $CMD || FAILED=1
    if [[ $FAILED == 1 ]]; then
        echo "Compile failed ${D}" | tee -a ${D}/compile_err.txt
        if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then
            touch "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-failed"
        fi
    else
        if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then
            touch "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}"
        fi
    fi

    # rm -f: a missing artifact must not abort this job under 'set -e'.
    rm -f ${D}/${FNAME}.a
    rm -f ${D}/${FNAME}.s
    rm -f ${D}/${FNAME}.h
    rm -f ${D}/${FNAME}.registration.cpp
    rm -f ${D}/compile_log.txt
}
267
+
268
IMAGES_DIR="${HALIDE_SRC_DIR}/apps/images"

# Benchmark one of the random samples.
#   $1 sample dir, $2 schedule id, $3 batch name, $4 sample id,
#   $5 pipeline (extra-args set) id, $6 file basename, $7 batch id (unused),
#   $8 index of the GPU to pin the benchmark to.
# Runs in the background; when the benchmark queue is enabled, retires its
# "<batch>-<sample>-benchmarking-gpu_<n>" queue entry on every exit path.
benchmark_sample() {
    D=${1}
    BATCH=${3}
    SAMPLE_ID=${4}
    GPU_INDEX=${8}

    if [[ ! -f ${D}/bench ]]; then
        # Compilation failed earlier; just retire the queue entry.
        if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then
            mv "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-benchmarking-gpu_${GPU_INDEX}" "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-completed"
        fi
        return
    fi

    # Pin to one GPU and cap the runtime so a bad schedule cannot hang the loop.
    CMD="CUDA_VISIBLE_DEVICES=${GPU_INDEX} HL_NUM_THREADS=${PARALLELISM} \
        ${TIMEOUT_CMD} -k ${BENCHMARKING_TIMEOUT} ${BENCHMARKING_TIMEOUT} \
        ${D}/bench"

    get_bench_args ${IMAGES_DIR} ${PIPELINE} ${D} BENCH_ARGS
    CMD="${CMD} \
        ${BENCH_ARGS} \
        --benchmarks=all"

    CMD="${CMD} 2> ${D}/bench_err.txt"

    eval $CMD | tee ${D}/bench.txt

    # An empty bench.txt is the failure signal: tee itself always succeeds,
    # so the pipeline's exit status cannot be used here.
    FAILED=0
    if [[ ! -s ${D}/bench.txt ]]; then
        echo "Benchmarking failed or timed out for ${D}"
        FAILED=1
    fi

    record_command $BATCH $SAMPLE_ID "$CMD" "benchmark_command" $FAILED

    if [[ ${FAILED} == 1 ]]; then
        if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then
            mv "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-benchmarking-gpu_${GPU_INDEX}" "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-completed"
        fi
        return
    fi

    # Add the runtime, pipeline id, and schedule id to the feature file
    R=$(cut -d' ' -f8 < ${D}/bench.txt)
    P=$5
    S=$2
    FNAME=$6

    ${AUTOSCHEDULER_BUILD_DIR}/featurization_to_sample ${D}/${FNAME}.featurization $R $P $S ${D}/${FNAME}.sample || echo "featurization_to_sample failed for ${D} (probably because benchmarking failed)"

    # Fix: rm -f instead of bare rm. A missing file would abort this function
    # under 'set -e' before the queue entry below is marked completed, which
    # permanently wedges the benchmark queue.
    rm -f ${D}/${FNAME}.featurization
    rm -f ${D}/bench
    rm -f ${D}/${FNAME}.stmt

    if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then
        mv "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-benchmarking-gpu_${GPU_INDEX}" "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-completed"
    fi
}
328
+
329
# How many batches to run, and how much benchmarking work that implies.
NUM_BATCHES=${NUM_BATCHES:-1}
SAMPLES_PER_BATCH=$((BATCH_SIZE * ${#GENERATOR_ARGS_SETS_ARRAY[@]}))
TOTAL_NUM_SAMPLES=$((NUM_BATCHES * SAMPLES_PER_BATCH))

echo "Num batches: ${NUM_BATCHES}"
echo "Total number of samples to be generated: ${TOTAL_NUM_SAMPLES}"

# When retraining after every batch, a benchmark queue only ever holds one
# batch's worth of samples; otherwise it holds the entire run.
if [[ ${RETRAIN_AFTER_EACH_BATCH} == 1 ]]; then
    NUM_SAMPLES_PER_QUEUE=${SAMPLES_PER_BATCH}
else
    NUM_SAMPLES_PER_QUEUE=${TOTAL_NUM_SAMPLES}
fi

# Budget 11 minutes of wall time per queued sample.
MAX_BENCHMARK_TIME=$((NUM_SAMPLES_PER_QUEUE * 660))

echo "Number of samples per queue: ${NUM_SAMPLES_PER_QUEUE}"
echo "Max. benchmark time: ${MAX_BENCHMARK_TIME}"

echo "Retrain after each batch: ${RETRAIN_AFTER_EACH_BATCH}"
347
+
348
# Dispatcher that runs in the background: watches BENCHMARK_QUEUE_DIR for
# queue-entry files created by make_featurization, assigns each ready sample
# to a free GPU, and exits once NUM_SAMPLES_PER_QUEUE samples have completed
# or MAX_BENCHMARK_TIME seconds have elapsed.
benchmark_loop() {
    mkdir -p "${BENCHMARK_QUEUE_DIR}"

    START_TIME="$SECONDS"
    MAX_TIME=${MAX_BENCHMARK_TIME}
    sleep 1

    echo "Starting benchmark loop for samples in ${SAMPLES}/*"
    echo "Max. benchmark loop time = ${MAX_TIME} seconds"

    local num_completed=0
    while [[ 1 ]]; do
        # Fix: initialize as an empty array instead of 'unset'. With 'set -u'
        # active and no jobs launched this pass, 'wait "${waitlist[@]}"' on an
        # unset variable is an unbound-variable error on bash < 4.4.
        waitlist=()

        # Fix: iterate a glob instead of parsing 'ls' output (SC2045).
        for FILEPATH in "${BENCHMARK_QUEUE_DIR}"/*; do
            [[ -e ${FILEPATH} ]] || continue  # empty dir: glob stays literal
            FILE=${FILEPATH##*/}

            if [[ $FILE == *"failed" ]]; then
                # The sample failed to compile
                num_completed=$((num_completed+1))
                rm "${BENCHMARK_QUEUE_DIR}/${FILE}"
                continue
            fi

            # Queue-entry names look like "<batch>-<sample_id>[-state...]".
            SAMPLE_ID=$(echo "${FILE}" | cut -d- -f 2)
            BATCH=$(echo "${FILE}" | cut -d- -f 1)
            SAMPLE_DIR="${SAMPLES}/${BATCH}/${SAMPLE_ID}"

            # We sometimes encounter spurious permission denied errors. Usually,
            # retrying will resolve them so remove from this file the
            # '-completed' tag and let it be benchmarked again
            if [[ -f "${SAMPLE_DIR}/bench_err.txt" ]]; then
                if grep -q "Permission denied" "${SAMPLE_DIR}/bench_err.txt"; then
                    FILE=${FILE%-completed}
                fi
            fi

            if [[ -f "${SAMPLE_DIR}/bench.txt" ]] && [[ $FILE == *"-completed" ]]; then
                # Benchmarking has been completed
                num_completed=$((num_completed+1))
                rm "${BENCHMARK_QUEUE_DIR}/${FILE}"
                continue
            fi

            if [[ $FILE == *"benchmarking"* ]]; then
                # Sample is still benchmarking
                continue
            fi

            # Batch dir names look like "batch_<id>_<extra_args_idx>_...".
            BATCH_ID=$(echo "${BATCH}" | cut -d_ -f 2)
            EXTRA_ARGS_IDX=$(echo "${BATCH}" | cut -d_ -f 3)
            DIR=${SAMPLES}/${BATCH}

            # Spin until a GPU frees up, then hand the sample to it.
            while [[ 1 ]]; do
                if find_unused_gpu ${BENCHMARK_QUEUE_DIR} ${NUM_GPUS} gpu_id; then
                    S=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID)
                    FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID)
                    # Mark this file with gpu_${gpu_id} so we know that GPU is
                    # occupied
                    mv "${BENCHMARK_QUEUE_DIR}/${FILE}" "${BENCHMARK_QUEUE_DIR}/${FILE}-benchmarking-gpu_${gpu_id}"
                    benchmark_sample "${DIR}/${SAMPLE_ID}" $S $BATCH $SAMPLE_ID $EXTRA_ARGS_IDX $FNAME $BATCH_ID $gpu_id &
                    waitlist+=("$!")
                    break
                else
                    # All GPUs are in use
                    sleep 0.1
                fi
            done
        done

        if [[ num_completed -eq NUM_SAMPLES_PER_QUEUE ]]; then
            # ${arr[@]+...} expansion: safe under 'set -u' even when empty.
            wait ${waitlist[@]+"${waitlist[@]}"}
            echo "Benchmarking complete."
            break
        fi

        ELAPSED_TIME=$((SECONDS-START_TIME))
        if [[ ELAPSED_TIME -ge MAX_TIME ]]; then
            echo "Benchmark queue has been active for more than ${MAX_TIME} seconds. Exiting."
            for pid in ${waitlist[@]+"${waitlist[@]}"}; do
                # Fix: tolerate jobs that already exited; an unguarded kill
                # would abort this function under 'set -e'.
                kill "${pid}" 2>/dev/null || true
            done
            break
        fi
    done

    TOTAL_BENCHMARK_TIME=$((SECONDS-START_TIME))
    echo "Benchmark time for batch: ${TOTAL_BENCHMARK_TIME}"
    rm -rf "${BENCHMARK_QUEUE_DIR}"
}
436
+
437
MAX_AUTOSCHEDULE_JOBS=${NUM_CPU_CORES}

BENCHMARK_QUEUE_ENABLED=0

if [[ $USE_BENCHMARK_QUEUE == 1 ]] && [[ $TRAIN_ONLY != 1 ]]; then
    # Include 1 job for the benchmark loop
    MAX_AUTOSCHEDULE_JOBS=$((NUM_CPU_CORES-NUM_GPUS-1))
    if [[ MAX_AUTOSCHEDULE_JOBS -le 0 ]]; then
        MAX_AUTOSCHEDULE_JOBS=${NUM_CPU_CORES}
        echo "Not enough cores available to use the benchmark queue"
        echo "Benchmark queue = OFF"
    else
        BENCHMARK_QUEUE_ENABLED=1
        echo "Benchmark queue = ON"
    fi
else
    echo "Benchmark queue = OFF"
fi

echo "Max. concurrent autoschedule jobs = ${MAX_AUTOSCHEDULE_JOBS}"

SECONDS=0

if [[ $TRAIN_ONLY != 1 ]]; then
    # One long-lived queue for the whole run when not retraining per batch.
    if [[ $BENCHMARK_QUEUE_ENABLED == 1 && $RETRAIN_AFTER_EACH_BATCH == 0 ]]; then
        echo "Starting benchmark queue"
        benchmark_loop &
        benchmark_loop_pid=("$!")
        echo "Starting PID: ${benchmark_loop_pid}"
    fi

    for ((BATCH_IDX=0;BATCH_IDX<${NUM_BATCHES};BATCH_IDX++)); do
        # A fresh queue per batch when retraining after each batch.
        if [[ $BENCHMARK_QUEUE_ENABLED == 1 && $RETRAIN_AFTER_EACH_BATCH == 1 ]]; then
            echo "Starting benchmark queue"
            benchmark_loop &
            benchmark_loop_pid=("$!")
            echo "Starting PID: ${benchmark_loop_pid}"
        fi

        # Draw a random batch id that has not been used in SAMPLES yet.
        while [[ 1 ]]; do
            BATCH_ID=$(od -vAn -N3 -tu4 < /dev/urandom | awk '{print $1}')
            if [ ! -d "${SAMPLES}/batch_${BATCH_ID}_0" ]; then
                break
            fi
        done

        echo "Starting compiling of new batch with id: ${BATCH_ID}"

        for ((EXTRA_ARGS_IDX=0;EXTRA_ARGS_IDX<${#GENERATOR_ARGS_SETS_ARRAY[@]};EXTRA_ARGS_IDX++)); do
            # Compile a batch of samples using the generator in parallel
            BATCH=batch_${BATCH_ID}_${EXTRA_ARGS_IDX}_${RANDOMIZE_TILINGS}_${USE_FREEZE}
            DIR=${SAMPLES}/${BATCH}

            # Copy the weights being used into the batch folder so that we can
            # repro failures
            mkdir -p "${DIR}/"
            cp "${WEIGHTS}" "${DIR}/used.weights"

            # Fix: global substitution '//;/ ' so that sets with more than two
            # ;-delimited values are fully expanded; the original single
            # substitution only replaced the first ';'.
            EXTRA_GENERATOR_ARGS=${GENERATOR_ARGS_SETS_ARRAY[EXTRA_ARGS_IDX]//;/ }

            if [ ! -z "${EXTRA_GENERATOR_ARGS}" ]; then
                echo "Adding extra generator args (${EXTRA_GENERATOR_ARGS}) for batch_${BATCH_ID}"
            fi

            echo ${EXTRA_GENERATOR_ARGS} > "${DIR}/extra_generator_args.txt"

            # Do parallel compilation in batches, so that machines with fewer
            # than BATCH_SIZE cores don't get swamped and timeout unnecessarily
            waitlist=()
            first=$(printf "%04d%04d" $BATCH_ID 0)
            last=$(printf "%04d%04d" $BATCH_ID $(($BATCH_SIZE-1)))
            echo Compiling ${BATCH_SIZE} samples from ${first} to ${last}
            CUR_SECONDS="$SECONDS"
            for ((SAMPLE_ID=0;SAMPLE_ID<${BATCH_SIZE};SAMPLE_ID++)); do
                # Throttle: keep at most MAX_AUTOSCHEDULE_JOBS jobs in flight.
                while [[ 1 ]]; do
                    RUNNING=$(jobs -r | wc -l)
                    if [[ RUNNING -ge MAX_AUTOSCHEDULE_JOBS ]]; then
                        sleep 1
                    else
                        break
                    fi
                done

                RANDOM_DROPOUT_SEED=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID)
                FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID)
                make_featurization "${DIR}/${SAMPLE_ID}" $RANDOM_DROPOUT_SEED $FNAME "$EXTRA_GENERATOR_ARGS" $BATCH $SAMPLE_ID ${DIR}/used.weights &
                waitlist+=("$!")
            done

            # benchmark them serially using rungen
            if [[ $USE_BENCHMARK_QUEUE == 0 && ${COMPILE_ONLY} == 0 ]]; then
                wait "${waitlist[@]}"
                COMPILE_TIME=$((SECONDS-CUR_SECONDS))
                echo "Compile time for batch: ${COMPILE_TIME}"

                CUR_SECONDS="$SECONDS"
                # One sample per GPU at a time; wait for each wave to finish.
                for ((SAMPLE_ID=0;SAMPLE_ID<${BATCH_SIZE};SAMPLE_ID=SAMPLE_ID+NUM_GPUS)); do
                    for ((INDEX=0;INDEX<NUM_GPUS;INDEX++)); do
                        SAMPLE_ID_GPU=$((SAMPLE_ID + INDEX))
                        S=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID_GPU)
                        FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID_GPU)
                        benchmark_sample "${DIR}/${SAMPLE_ID_GPU}" $S $BATCH $SAMPLE_ID_GPU $EXTRA_ARGS_IDX $FNAME $BATCH_ID $INDEX &
                    done
                    wait
                done
                BENCHMARK_TIME=$((SECONDS-CUR_SECONDS))
                echo "Benchmark time for batch: ${BENCHMARK_TIME}"
            fi

            if [[ ${COMPILE_ONLY} == 1 ]]; then
                wait "${waitlist[@]}"
            fi
        done

        if [[ ${RETRAIN_AFTER_EACH_BATCH} == 1 ]]; then
            if [[ $BENCHMARK_QUEUE_ENABLED == 1 ]]; then
                wait "${waitlist[@]}"
                echo "Waiting for benchmarking to complete"
                echo "Waiting PID: ${benchmark_loop_pid}"
                wait "${benchmark_loop_pid}"
            fi

            CUR_SECONDS="$SECONDS"
            retrain_cost_model ${HALIDE_BUILD_DIR} ${SAMPLES} ${WEIGHTS} ${PARALLELISM} ${EPOCHS} ${PIPELINE} ${LEARNING_RATE}
            TRAIN_TIME=$((SECONDS-CUR_SECONDS))
            echo "Train time for batch with ID = ${BATCH_ID}: ${TRAIN_TIME}"
        fi
        BATCH_ID=$((BATCH_ID+1))
    done

    if [[ ${BENCHMARK_QUEUE_ENABLED} == 1 && ${RETRAIN_AFTER_EACH_BATCH} == 0 ]]; then
        wait "${waitlist[@]}"
        echo "Waiting for benchmarking to complete"
        echo "Waiting PID: ${benchmark_loop_pid}"
        wait "${benchmark_loop_pid}"
    fi
fi
574
+
575
# Nothing left to do if we already retrained per batch, or never benchmarked.
if [[ ${RETRAIN_AFTER_EACH_BATCH} == 1 || ${COMPILE_ONLY} == 1 ]]; then
    exit
fi

# retrain model weights on all samples seen so far
echo Retraining model...

CUR_SECONDS="$SECONDS"
# Fix: pass the build dir, not the source dir — this must match the
# retrain_cost_model invocation in the per-batch path, whose first
# argument is ${HALIDE_BUILD_DIR}.
retrain_cost_model ${HALIDE_BUILD_DIR} ${SAMPLES} ${WEIGHTS} ${PARALLELISM} ${EPOCHS} ${PIPELINE} ${LEARNING_RATE}
TRAIN_TIME=$((SECONDS-CUR_SECONDS))
echo "Num batches = ${NUM_BATCHES}. Train time: ${TRAIN_TIME}"

if [[ $TRAIN_ONLY == 1 ]]; then
    echo Num batches = ${NUM_BATCHES}. Took ${SECONDS} seconds to retrain
else
    echo Num batches = ${NUM_BATCHES}. Took ${SECONDS} seconds to compile, benchmark, and retrain
fi