voxcity 0.7.0__py3-none-any.whl → 1.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voxcity/__init__.py +14 -14
- voxcity/downloader/ocean.py +559 -0
- voxcity/exporter/__init__.py +12 -12
- voxcity/exporter/cityles.py +633 -633
- voxcity/exporter/envimet.py +733 -728
- voxcity/exporter/magicavoxel.py +333 -333
- voxcity/exporter/netcdf.py +238 -238
- voxcity/exporter/obj.py +1480 -1480
- voxcity/generator/__init__.py +47 -44
- voxcity/generator/api.py +727 -675
- voxcity/generator/grids.py +394 -379
- voxcity/generator/io.py +94 -94
- voxcity/generator/pipeline.py +582 -282
- voxcity/generator/update.py +429 -0
- voxcity/generator/voxelizer.py +18 -6
- voxcity/geoprocessor/__init__.py +75 -75
- voxcity/geoprocessor/draw.py +1494 -1219
- voxcity/geoprocessor/merge_utils.py +91 -91
- voxcity/geoprocessor/mesh.py +806 -806
- voxcity/geoprocessor/network.py +708 -708
- voxcity/geoprocessor/raster/__init__.py +2 -0
- voxcity/geoprocessor/raster/buildings.py +435 -428
- voxcity/geoprocessor/raster/core.py +31 -0
- voxcity/geoprocessor/raster/export.py +93 -93
- voxcity/geoprocessor/raster/landcover.py +178 -51
- voxcity/geoprocessor/raster/raster.py +1 -1
- voxcity/geoprocessor/utils.py +824 -824
- voxcity/models.py +115 -113
- voxcity/simulator/solar/__init__.py +66 -43
- voxcity/simulator/solar/integration.py +336 -336
- voxcity/simulator/solar/sky.py +668 -0
- voxcity/simulator/solar/temporal.py +792 -434
- voxcity/simulator_gpu/__init__.py +115 -0
- voxcity/simulator_gpu/common/__init__.py +9 -0
- voxcity/simulator_gpu/common/geometry.py +11 -0
- voxcity/simulator_gpu/core.py +322 -0
- voxcity/simulator_gpu/domain.py +262 -0
- voxcity/simulator_gpu/environment.yml +11 -0
- voxcity/simulator_gpu/init_taichi.py +154 -0
- voxcity/simulator_gpu/integration.py +15 -0
- voxcity/simulator_gpu/kernels.py +56 -0
- voxcity/simulator_gpu/radiation.py +28 -0
- voxcity/simulator_gpu/raytracing.py +623 -0
- voxcity/simulator_gpu/sky.py +9 -0
- voxcity/simulator_gpu/solar/__init__.py +178 -0
- voxcity/simulator_gpu/solar/core.py +66 -0
- voxcity/simulator_gpu/solar/csf.py +1249 -0
- voxcity/simulator_gpu/solar/domain.py +561 -0
- voxcity/simulator_gpu/solar/epw.py +421 -0
- voxcity/simulator_gpu/solar/integration.py +2953 -0
- voxcity/simulator_gpu/solar/radiation.py +3019 -0
- voxcity/simulator_gpu/solar/raytracing.py +686 -0
- voxcity/simulator_gpu/solar/reflection.py +533 -0
- voxcity/simulator_gpu/solar/sky.py +907 -0
- voxcity/simulator_gpu/solar/solar.py +337 -0
- voxcity/simulator_gpu/solar/svf.py +446 -0
- voxcity/simulator_gpu/solar/volumetric.py +1151 -0
- voxcity/simulator_gpu/solar/voxcity.py +2953 -0
- voxcity/simulator_gpu/temporal.py +13 -0
- voxcity/simulator_gpu/utils.py +25 -0
- voxcity/simulator_gpu/view.py +32 -0
- voxcity/simulator_gpu/visibility/__init__.py +109 -0
- voxcity/simulator_gpu/visibility/geometry.py +278 -0
- voxcity/simulator_gpu/visibility/integration.py +808 -0
- voxcity/simulator_gpu/visibility/landmark.py +753 -0
- voxcity/simulator_gpu/visibility/view.py +944 -0
- voxcity/utils/__init__.py +11 -0
- voxcity/utils/classes.py +194 -0
- voxcity/utils/lc.py +80 -39
- voxcity/utils/shape.py +230 -0
- voxcity/visualizer/__init__.py +24 -24
- voxcity/visualizer/builder.py +43 -43
- voxcity/visualizer/grids.py +141 -141
- voxcity/visualizer/maps.py +187 -187
- voxcity/visualizer/renderer.py +1146 -928
- {voxcity-0.7.0.dist-info → voxcity-1.0.13.dist-info}/METADATA +56 -52
- voxcity-1.0.13.dist-info/RECORD +116 -0
- voxcity-0.7.0.dist-info/RECORD +0 -77
- {voxcity-0.7.0.dist-info → voxcity-1.0.13.dist-info}/WHEEL +0 -0
- {voxcity-0.7.0.dist-info → voxcity-1.0.13.dist-info}/licenses/AUTHORS.rst +0 -0
- {voxcity-0.7.0.dist-info → voxcity-1.0.13.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optimized Radiation Computation for GPU
|
|
3
|
+
|
|
4
|
+
This module provides optimized GPU kernels for radiation computation
|
|
5
|
+
that minimize kernel launches and synchronization overhead.
|
|
6
|
+
|
|
7
|
+
Key optimizations:
|
|
8
|
+
1. Fused kernels - combine multiple operations into single kernel launches
|
|
9
|
+
2. Reduced synchronization - batch operations to minimize ti.sync() calls
|
|
10
|
+
3. Better memory access patterns - coalesced memory access
|
|
11
|
+
4. Reduced atomic operations - use local accumulation where possible
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import taichi as ti
|
|
15
|
+
import numpy as np
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
# Vector type
|
|
19
|
+
# Module-level alias for Taichi's 3-component vector type (ti.math.vec3).
# NOTE(review): not referenced by any definition visible in this file.
Vector3 = ti.math.vec3
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ti.data_oriented
class OptimizedReflectionSolver:
    """
    GPU solver for multi-bounce radiation reflections.

    The solver owns preallocated ping-pong buffers and accumulators so that
    no Taichi fields are allocated while a solve is running. Each bounce is
    executed by one kernel whose phases are separate outermost loops, so
    every phase is parallel over surfaces / matrix entries and phases run in
    program order without any host-side synchronization.
    """

    def __init__(
        self,
        n_surfaces: int,
        max_svf_entries: int,
        n_reflection_steps: int = 3
    ):
        """
        Allocate the work buffers.

        Args:
            n_surfaces: Number of surfaces; length of every buffer.
            max_svf_entries: Stored for reference only; no buffer in this
                class is sized by it.
            n_reflection_steps: Stored for reference only; the number of
                bounces actually run is the ``n_steps`` argument of
                ``solve_reflections_fused``.
        """
        self.n_surfaces = n_surfaces
        self.max_svf_entries = max_svf_entries
        self.n_reflection_steps = n_reflection_steps

        # Ping-pong buffers holding the incoming radiation of the current
        # and the next bounce.
        self._surfins_a = ti.field(dtype=ti.f32, shape=(n_surfaces,))
        self._surfins_b = ti.field(dtype=ti.f32, shape=(n_surfaces,))
        # Outgoing (reflected) radiation of the current bounce.
        self._surfout = ti.field(dtype=ti.f32, shape=(n_surfaces,))

        # Totals accumulated over all bounces.
        self._total_incoming = ti.field(dtype=ti.f32, shape=(n_surfaces,))
        self._total_outgoing = ti.field(dtype=ti.f32, shape=(n_surfaces,))

    @ti.kernel
    def _init_buffers(self, initial_sw: ti.template()):
        """Seed buffer A and the incoming total with ``initial_sw``; zero the outgoing total."""
        for i in range(self.n_surfaces):
            self._surfins_a[i] = initial_sw[i]
            self._total_incoming[i] = initial_sw[i]
            self._total_outgoing[i] = 0.0

    @ti.kernel
    def _reflection_step(
        self,
        surfins_in: ti.template(),
        surfins_out: ti.template(),
        albedo: ti.template(),
        svf: ti.template(),
        svf_source: ti.template(),
        svf_target: ti.template(),
        svf_vf: ti.template(),
        svf_trans: ti.template(),
        svf_nnz: ti.i32
    ):
        """Run one bounce: reflect ``surfins_in``, scatter it into ``surfins_out``, accumulate totals."""
        # Phase 1: outgoing = albedo * incoming; clear the destination buffer.
        for i in range(self.n_surfaces):
            out = albedo[i] * surfins_in[i]
            self._surfout[i] = out
            self._total_outgoing[i] += out
            surfins_out[i] = 0.0

        # Phase 2: sparse (COO) matrix-vector product. Several entries may
        # share a target, hence the atomic add.
        for idx in range(svf_nnz):
            outgoing = self._surfout[svf_source[idx]]
            # Sources reflecting 0.01 or less are skipped.
            if outgoing > 0.01:
                contribution = outgoing * svf_vf[idx] * svf_trans[idx]
                ti.atomic_add(surfins_out[svf_target[idx]], contribution)

        # Phase 3: scale by the urban view factor (1 - svf) and accumulate.
        for i in range(self.n_surfaces):
            urban_vf = 1.0 - svf[i]
            if urban_vf < 0.01:
                # Surface sees (almost) only sky: it receives no reflections.
                surfins_out[i] = 0.0
            else:
                surfins_out[i] *= urban_vf
                self._total_incoming[i] += surfins_out[i]

    def solve_reflections_fused(
        self,
        # Initial radiation
        initial_sw,
        # Surface properties
        albedo,
        svf,
        # Cached SVF matrix (sparse COO)
        svf_source,
        svf_target,
        svf_vf,
        svf_trans,
        svf_nnz: int,
        # Number of reflection steps
        n_steps: int
    ) -> None:
        """
        Run the complete multi-bounce reflection computation.

        This is a Python-scope driver that launches one initialization
        kernel followed by one kernel per bounce. The bounce loop must not
        live at the outermost scope of a Taichi kernel: Taichi parallelizes
        outermost loops, which would run the bounces concurrently and
        serialize the per-surface loops nested inside them. ``ti.sync()``
        is a host-side call and provides no barrier inside a kernel, so no
        synchronization is issued here; consecutive launches and the
        outermost loops within a kernel execute in order.

        Args:
            initial_sw: Field with the initial incoming radiation per surface.
            albedo: Field with the per-surface albedo.
            svf: Field with the per-surface sky view factor.
            svf_source: Field with the source surface index of each entry.
            svf_target: Field with the target surface index of each entry.
            svf_vf: Field with the view factor of each entry.
            svf_trans: Field with the transmissivity of each entry.
            svf_nnz: Number of valid entries in the ``svf_*`` fields.
            n_steps: Number of bounces to run; zero or less only initializes
                the totals.

        Results are read back with ``get_results``.
        """
        self._init_buffers(initial_sw)

        # Ping-pong: the buffer written in one bounce is read in the next.
        src, dst = self._surfins_a, self._surfins_b
        for _ in range(n_steps):
            self._reflection_step(
                src, dst, albedo, svf,
                svf_source, svf_target, svf_vf, svf_trans,
                svf_nnz
            )
            src, dst = dst, src

    def get_results(self):
        """Get accumulated totals as numpy arrays."""
        return {
            'total_incoming': self._total_incoming.to_numpy(),
            'total_outgoing': self._total_outgoing.to_numpy()
        }
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@ti.kernel
def fused_reflection_step_kernel(
    # Current incoming radiation (input)
    surfins_in: ti.template(),
    # Next incoming radiation (output, overwritten)
    surfins_out: ti.template(),
    # Outgoing buffer (temporary)
    surfout: ti.template(),
    # Surface properties
    albedo: ti.template(),
    svf: ti.template(),
    # Accumulated totals
    total_incoming: ti.template(),
    total_outgoing: ti.template(),
    # Cached SVF matrix
    svf_source: ti.template(),
    svf_target: ti.template(),
    svf_vf: ti.template(),
    svf_trans: ti.template(),
    svf_nnz: ti.i32,
    n_surfaces: ti.i32
):
    """
    Single fused kernel for one reflection step.

    Combines outgoing computation, distribution and accumulation in one
    kernel. The three phases are separate outermost loops: each one is
    parallelized on its own and they execute in program order, so no
    explicit synchronization is placed between them (``ti.sync()`` is a
    host-side call and has no barrier effect inside a kernel).

    Args:
        surfins_in: Incoming radiation per surface for this step (read).
        surfins_out: Incoming radiation per surface for the next step
            (zeroed, then written).
        surfout: Scratch buffer receiving ``albedo * surfins_in``.
        albedo: Per-surface albedo.
        svf: Per-surface sky view factor; ``1 - svf`` scales what a
            surface receives.
        total_incoming: Accumulator, increased by the scaled ``surfins_out``.
        total_outgoing: Accumulator, increased by ``surfout``.
        svf_source: Source surface index of each sparse entry.
        svf_target: Target surface index of each sparse entry.
        svf_vf: View factor of each sparse entry.
        svf_trans: Transmissivity of each sparse entry.
        svf_nnz: Number of sparse entries to process.
        n_surfaces: Number of surfaces to process.
    """
    # Phase 1: compute outgoing and reset the output buffer.
    for i in range(n_surfaces):
        out = albedo[i] * surfins_in[i]
        surfout[i] = out
        total_outgoing[i] += out
        surfins_out[i] = 0.0

    # Phase 2: sparse matrix-vector multiply. Several entries may share a
    # target, hence the atomic add.
    for idx in range(svf_nnz):
        source = svf_source[idx]
        target = svf_target[idx]
        vf = svf_vf[idx]
        trans = svf_trans[idx]

        outgoing = surfout[source]
        # Sources reflecting 0.01 or less are skipped.
        if outgoing > 0.01:
            ti.atomic_add(surfins_out[target], outgoing * vf * trans)

    # Phase 3: apply urban view factor scaling and accumulate.
    for i in range(n_surfaces):
        urban_vf = 1.0 - svf[i]
        if urban_vf < 0.01:
            # Surface sees (almost) only sky: it receives no reflections.
            surfins_out[i] = 0.0
        else:
            surfins_out[i] *= urban_vf
            total_incoming[i] += surfins_out[i]
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@ti.kernel
def _initial_radiation_pass(
    surf_direction: ti.template(),
    surf_svf: ti.template(),
    surf_shadow: ti.template(),
    surf_canopy_trans: ti.template(),
    sun_dir_x: ti.f32,
    sun_dir_y: ti.f32,
    sun_dir_z: ti.f32,
    cos_zenith: ti.f32,
    sw_direct: ti.f32,
    sw_diffuse: ti.f32,
    n_surfaces: ti.i32,
    sw_in_direct: ti.template(),
    sw_in_diffuse: ti.template(),
    sw_in_reflected: ti.template(),
    sw_out_total: ti.template(),
    surfins_a: ti.template()
):
    """Compute direct and diffuse radiation per surface and seed the first reflection buffer."""
    # Direct radiation is only applied when cos_zenith exceeds this value.
    min_stable_coszen = 0.0262

    for i in range(n_surfaces):
        direction = surf_direction[i]

        # Axis-aligned surface normal derived from the direction code.
        normal_x = 0.0
        normal_y = 0.0
        normal_z = 0.0
        if direction == 0:  # Up
            normal_z = 1.0
        elif direction == 1:  # Down
            normal_z = -1.0
        elif direction == 2:  # North
            normal_y = 1.0
        elif direction == 3:  # South
            normal_y = -1.0
        elif direction == 4:  # East
            normal_x = 1.0
        elif direction == 5:  # West
            normal_x = -1.0

        # Cosine of incidence, clamped so back-facing surfaces get zero.
        cos_inc = sun_dir_x * normal_x + sun_dir_y * normal_y + sun_dir_z * normal_z
        cos_inc = ti.max(0.0, cos_inc)

        # Direct radiation: only for surfaces whose shadow value is below 0.5.
        sw_dir = 0.0
        if cos_zenith > min_stable_coszen and surf_shadow[i] < 0.5:
            sw_dir = sw_direct * cos_inc * surf_canopy_trans[i]

        # Diffuse radiation: downward-facing surfaces receive none.
        sw_dif = 0.0
        if direction != 1:
            sw_dif = sw_diffuse * surf_svf[i]

        sw_in_direct[i] = sw_dir
        sw_in_diffuse[i] = sw_dif
        sw_in_reflected[i] = 0.0
        sw_out_total[i] = 0.0

        # Input of the first reflection bounce.
        surfins_a[i] = sw_dir + sw_dif


@ti.kernel
def _reflection_bounce(
    surfins_in: ti.template(),
    surfins_out: ti.template(),
    surfout: ti.template(),
    surf_albedo: ti.template(),
    surf_svf: ti.template(),
    svf_source: ti.template(),
    svf_target: ti.template(),
    svf_vf: ti.template(),
    svf_trans: ti.template(),
    svf_nnz: ti.i32,
    n_surfaces: ti.i32,
    sw_in_reflected: ti.template(),
    sw_out_total: ti.template()
):
    """Run one reflection bounce from ``surfins_in`` into ``surfins_out`` and accumulate totals."""
    # Compute outgoing and reset the destination buffer.
    for i in range(n_surfaces):
        surfout[i] = surf_albedo[i] * surfins_in[i]
        surfins_out[i] = 0.0
        sw_out_total[i] += surfout[i]

    # Sparse matmul for reflection distribution. Several entries may share
    # a target, hence the atomic add.
    for idx in range(svf_nnz):
        out_val = surfout[svf_source[idx]]
        # Sources reflecting 0.01 or less are skipped.
        if out_val > 0.01:
            contrib = out_val * svf_vf[idx] * svf_trans[idx]
            ti.atomic_add(surfins_out[svf_target[idx]], contrib)

    # Apply urban view factor (1 - svf) and accumulate to reflected.
    for i in range(n_surfaces):
        urban_vf = 1.0 - surf_svf[i]
        if urban_vf < 0.01:
            surfins_out[i] = 0.0
        else:
            surfins_out[i] *= urban_vf
            sw_in_reflected[i] += surfins_out[i]


def compute_initial_and_reflections_fused(
    # Surface properties
    surf_direction,
    surf_svf,
    surf_shadow,
    surf_canopy_trans,
    surf_albedo,
    surf_normal,
    # Sun properties
    sun_dir_x,
    sun_dir_y,
    sun_dir_z,
    cos_zenith,
    # Radiation inputs
    sw_direct,
    sw_diffuse,
    # SVF matrix
    svf_source,
    svf_target,
    svf_vf,
    svf_trans,
    svf_nnz,
    # Number of surfaces and reflection steps
    n_surfaces,
    n_ref_steps,
    # Output arrays (preallocated)
    sw_in_direct,
    sw_in_diffuse,
    sw_in_reflected,
    sw_out_total,
    # Temporary buffers (ping-pong)
    surfins_a,
    surfins_b,
    surfout
):
    """
    Initial radiation pass followed by all reflection iterations.

    This is a Python-scope driver that launches one kernel for the initial
    pass and one kernel per bounce. The bounce loop must not live at the
    outermost scope of a Taichi kernel: Taichi parallelizes outermost
    loops, which would run the bounces concurrently and serialize the
    per-surface loops nested inside them. ``ti.sync()`` is a host-side call
    and provides no barrier inside a kernel, so none is issued here;
    consecutive launches execute in order.

    Args:
        surf_direction: Field of direction codes (0 up, 1 down, 2 north,
            3 south, 4 east, 5 west).
        surf_svf: Field of per-surface sky view factors.
        surf_shadow: Field of shadow values; direct radiation is applied
            where the value is below 0.5.
        surf_canopy_trans: Field of canopy transmissivity applied to
            direct radiation.
        surf_albedo: Field of per-surface albedo.
        surf_normal: Unused; normals are derived from ``surf_direction``.
        sun_dir_x, sun_dir_y, sun_dir_z: Components of the sun direction.
        cos_zenith: Cosine of the solar zenith angle.
        sw_direct: Direct shortwave radiation input.
        sw_diffuse: Diffuse shortwave radiation input.
        svf_source, svf_target, svf_vf, svf_trans: Sparse (COO) view-factor
            matrix: source index, target index, view factor and
            transmissivity per entry.
        svf_nnz: Number of valid sparse entries.
        n_surfaces: Number of surfaces to process.
        n_ref_steps: Number of reflection bounces; zero or less runs the
            initial pass only.
        sw_in_direct, sw_in_diffuse, sw_in_reflected, sw_out_total:
            Preallocated output fields, overwritten by this call.
        surfins_a, surfins_b, surfout: Preallocated scratch fields.
    """
    _initial_radiation_pass(
        surf_direction, surf_svf, surf_shadow, surf_canopy_trans,
        sun_dir_x, sun_dir_y, sun_dir_z, cos_zenith,
        sw_direct, sw_diffuse, n_surfaces,
        sw_in_direct, sw_in_diffuse, sw_in_reflected, sw_out_total,
        surfins_a
    )

    # Ping-pong: the buffer written in one bounce is read in the next.
    src, dst = surfins_a, surfins_b
    for _ in range(n_ref_steps):
        _reflection_bounce(
            src, dst, surfout, surf_albedo, surf_svf,
            svf_source, svf_target, svf_vf, svf_trans,
            svf_nnz, n_surfaces,
            sw_in_reflected, sw_out_total
        )
        src, dst = dst, src
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def benchmark_reflections():
    """
    Time the reflection solve as separate kernels versus the fused solver.

    Builds synthetic surface data and a random sparse view-factor matrix,
    runs one untimed warm-up of each variant, then times 20 runs of each
    and prints mean/min wall-clock times. Taichi must already be
    initialized by the caller.

    Returns:
        Tuple ``(times_separate, times_fused)``: per-run durations in
        seconds for the separate-kernel and the fused variant.
    """
    import time

    print("Creating test data...")
    n_surfaces = 10000
    # 500k COO entries over a 10k x 10k matrix, i.e. 0.5% of all pairs.
    svf_nnz = 500000
    n_ref_steps = 3

    def f32_field(length):
        return ti.field(dtype=ti.f32, shape=(length,))

    def i32_field(length):
        return ti.field(dtype=ti.i32, shape=(length,))

    # Per-surface inputs
    initial_sw = f32_field(n_surfaces)
    albedo = f32_field(n_surfaces)
    svf = f32_field(n_surfaces)

    # Sparse view-factor matrix in COO layout
    svf_source = i32_field(svf_nnz)
    svf_target = i32_field(svf_nnz)
    svf_vf = f32_field(svf_nnz)
    svf_trans = f32_field(svf_nnz)

    # Work and output buffers of the separate-kernel variant
    surfins_a = f32_field(n_surfaces)
    surfins_b = f32_field(n_surfaces)
    surfout = f32_field(n_surfaces)
    total_incoming = f32_field(n_surfaces)
    total_outgoing = f32_field(n_surfaces)

    # Fixed seed; the draw order below determines the generated data.
    np.random.seed(42)

    def rand_f32(length):
        return np.random.rand(length).astype(np.float32)

    def rand_index(length):
        return np.random.randint(0, n_surfaces, length).astype(np.int32)

    initial_sw.from_numpy(rand_f32(n_surfaces) * 500)
    albedo.from_numpy(rand_f32(n_surfaces) * 0.3 + 0.1)
    svf.from_numpy(rand_f32(n_surfaces) * 0.5 + 0.3)

    svf_source.from_numpy(rand_index(svf_nnz))
    svf_target.from_numpy(rand_index(svf_nnz))
    svf_vf.from_numpy(rand_f32(svf_nnz) * 0.1)
    svf_trans.from_numpy(rand_f32(svf_nnz) * 0.5 + 0.5)

    print("Warming up...")

    @ti.kernel
    def init_step(seed: ti.template(), first_in: ti.template(), acc_in: ti.template(), acc_out: ti.template(), count: ti.i32):
        # Seed the first input buffer and the incoming total; zero the outgoing total.
        for k in range(count):
            first_in[k] = seed[k]
            acc_in[k] = seed[k]
            acc_out[k] = 0.0

    @ti.kernel
    def compute_outgoing_step(incoming: ti.template(), reflected: ti.template(), refl_coeff: ti.template(), acc_out: ti.template(), count: ti.i32):
        # reflected = albedo * incoming, accumulated into the outgoing total.
        for k in range(count):
            value = refl_coeff[k] * incoming[k]
            reflected[k] = value
            acc_out[k] += value

    @ti.kernel
    def reset_buffer(target_buf: ti.template(), count: ti.i32):
        for k in range(count):
            target_buf[k] = 0.0

    @ti.kernel
    def sparse_matmul_step(
        reflected: ti.template(),
        next_in: ti.template(),
        rows: ti.template(),
        cols: ti.template(),
        factors: ti.template(),
        transmit: ti.template(),
        entries: ti.i32
    ):
        # Scatter each source's reflected radiation to its targets.
        for e in range(entries):
            from_surf = rows[e]
            to_surf = cols[e]
            factor = factors[e]
            through = transmit[e]
            amount = reflected[from_surf]
            if amount > 0.01:
                ti.atomic_add(next_in[to_surf], amount * factor * through)

    @ti.kernel
    def scale_and_accumulate(next_in: ti.template(), sky_vf: ti.template(), acc_in: ti.template(), count: ti.i32):
        # Scale by (1 - sky view factor) and add to the incoming total.
        for k in range(count):
            urban_vf = 1.0 - sky_vf[k]
            if urban_vf < 0.01:
                next_in[k] = 0.0
            else:
                next_in[k] *= urban_vf
                acc_in[k] += next_in[k]

    def run_separate_pass():
        """One full solve using a separate kernel launch per phase, then a host sync."""
        init_step(initial_sw, surfins_a, total_incoming, total_outgoing, n_surfaces)
        # Even steps read A and write B, odd steps the reverse.
        source_buf, dest_buf = surfins_a, surfins_b
        for _ in range(n_ref_steps):
            compute_outgoing_step(source_buf, surfout, albedo, total_outgoing, n_surfaces)
            reset_buffer(dest_buf, n_surfaces)
            sparse_matmul_step(surfout, dest_buf, svf_source, svf_target, svf_vf, svf_trans, svf_nnz)
            scale_and_accumulate(dest_buf, svf, total_incoming, n_surfaces)
            source_buf, dest_buf = dest_buf, source_buf
        ti.sync()

    def time_passes(run_pass, repeats):
        """Return the wall-clock duration in seconds of each of ``repeats`` calls."""
        durations = []
        for _ in range(repeats):
            started = time.perf_counter()
            run_pass()
            durations.append(time.perf_counter() - started)
        return durations

    # Untimed warm-up so kernel compilation is excluded from the timings.
    run_separate_pass()

    print(f"\nBenchmarking SEPARATE KERNELS ({n_ref_steps} reflection steps)...")
    n_iterations = 20
    times_separate = time_passes(run_separate_pass, n_iterations)

    mean_sep = np.mean(times_separate) * 1000
    min_sep = np.min(times_separate) * 1000
    print(f" Mean time: {mean_sep:.2f}ms")
    print(f" Min time: {min_sep:.2f}ms")

    print(f"\nBenchmarking FUSED KERNEL ({n_ref_steps} reflection steps)...")
    solver = OptimizedReflectionSolver(n_surfaces, svf_nnz, n_ref_steps)

    def run_fused_pass():
        """One full solve through the solver object, then a host sync."""
        solver.solve_reflections_fused(
            initial_sw, albedo, svf,
            svf_source, svf_target, svf_vf, svf_trans,
            svf_nnz, n_ref_steps
        )
        ti.sync()

    # Untimed warm-up of the fused variant.
    run_fused_pass()

    times_fused = time_passes(run_fused_pass, n_iterations)

    mean_fused = np.mean(times_fused) * 1000
    min_fused = np.min(times_fused) * 1000
    print(f" Mean time: {mean_fused:.2f}ms")
    print(f" Min time: {min_fused:.2f}ms")

    print(f"\n Surfaces: {n_surfaces}, SVF entries: {svf_nnz}")
    print(f"\n Comparison: Separate={min_sep:.2f}ms, Fused={min_fused:.2f}ms")
    # Report the speed-up of whichever variant had the lower best-case time.
    if min_fused < min_sep:
        print(f" Fused is {min_sep/min_fused:.2f}x faster")
    else:
        print(f" Separate is {min_fused/min_sep:.2f}x faster")

    return times_separate, times_fused
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
if __name__ == "__main__":
    # Script entry point: run the benchmark once on Taichi's GPU backend.
    banner = "=" * 60
    print(banner)
    print("Testing Reflection Solver on GPU")
    print(banner)
    ti.init(arch=ti.gpu, default_fp=ti.f32)
    gpu_times = benchmark_reflections()

    # Taichi is initialized only once in this process, so no CPU run
    # follows; the hint below tells the user how to get one.
    print("\nNote: To compare with CPU, run with ti.init(arch=ti.cpu)")
|