psx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/psx/gte.rb ADDED
@@ -0,0 +1,775 @@
1
+ # frozen_string_literal: true
2
+
3
+ module PSX
4
+ # Geometry Transformation Engine (COP2).
5
+ #
6
+ # 32 data registers, 32 control registers, ~24 commands.
7
+ # Register and flag semantics follow the Nocash PSX spec.
8
+ # Math is implemented straightforwardly (not bit-exact with the hardware UNR
9
+ # divide table), which is enough to get the BIOS boot animation moving.
10
+ class GTE
11
# FLAG register bit masks, listed from bit 12 up to bit 30.
FLAG_IR0_SAT         = 0x0000_1000 # bit 12
FLAG_SY2_SAT         = 0x0000_2000 # bit 13
FLAG_SX2_SAT         = 0x0000_4000 # bit 14
FLAG_MAC0_NEG        = 0x0000_8000 # bit 15
FLAG_MAC0_POS        = 0x0001_0000 # bit 16
FLAG_DIVIDE_OVERFLOW = 0x0002_0000 # bit 17
FLAG_SZ3_OTZ_SAT     = 0x0004_0000 # bit 18
FLAG_COLOR_B_SAT     = 0x0008_0000 # bit 19
FLAG_COLOR_G_SAT     = 0x0010_0000 # bit 20
FLAG_COLOR_R_SAT     = 0x0020_0000 # bit 21
FLAG_IR3_SAT         = 0x0040_0000 # bit 22
FLAG_IR2_SAT         = 0x0080_0000 # bit 23
FLAG_IR1_SAT         = 0x0100_0000 # bit 24
FLAG_MAC3_NEG        = 0x0200_0000 # bit 25
FLAG_MAC2_NEG        = 0x0400_0000 # bit 26
FLAG_MAC1_NEG        = 0x0800_0000 # bit 27
FLAG_MAC3_POS        = 0x1000_0000 # bit 28
FLAG_MAC2_POS        = 0x2000_0000 # bit 29
FLAG_MAC1_POS        = 0x4000_0000 # bit 30
# Bits whose OR forms the summary bit 31: bits 30..23 plus bits 18..13.
FLAG_ERROR_MASK      = 0x7F87_E000
33
+
34
# Create a GTE whose registers all hold their reset values.
def initialize
  reset
end
37
+
38
# Put every data and control register back to its initial value: everything
# is zero except LZCR, which starts at 32 (the sign-bit count of LZCS == 0).
def reset
  # -- Data registers (logical view) --
  @v        = Array.new(3) { Array.new(3, 0) } # V0..V2, [x, y, z] each (S16)
  @rgbc     = Array.new(4, 0)                  # R, G, B, CODE (U8 each)
  @otz      = 0                                # U16
  @ir0 = @ir1 = @ir2 = @ir3 = 0                # S16 each
  @sxy      = Array.new(3) { Array.new(2, 0) } # SXY0..SXY2, [x, y] each (S16)
  @sz       = Array.new(4, 0)                  # SZ0..SZ3 (U16)
  @rgb_fifo = Array.new(3) { Array.new(4, 0) } # RGB0..RGB2, [r, g, b, cd]
  @res1     = 0                                # prohibited register
  @mac0 = @mac1 = @mac2 = @mac3 = 0            # S32 each
  @lzcs     = 0                                # S32
  @lzcr     = 32                               # leading sign-bit count of LZCS

  # -- Control registers (logical view) --
  # Rotation, light-source and light-colour matrices: 3x3, S16 entries.
  @rt, @ls, @lc = Array.new(3) { Array.new(3) { Array.new(3, 0) } }
  # Translation, background colour and far colour vectors: S32 entries.
  @tr, @bk, @fc = Array.new(3) { Array.new(3, 0) }
  @ofx = @ofy = 0                              # S32 each
  @h    = 0                                    # U16
  @dqa  = 0                                    # S16
  @dqb  = 0                                    # S32
  @zsf3 = @zsf4 = 0                            # S16 each
  @flag = 0                                    # U32
end
70
+
71
+ # --- Public register access ---------------------------------------------
72
+
73
# Read data register +idx+ (only the low 5 bits are used) and return it as
# an unsigned 32-bit word.
def read_data(idx)
  reg = idx & 0x1F
  case reg
  when 0, 2, 4 # VXY0..2: Y in the upper half, X in the lower half
    vec = @v[reg / 2]
    pack_xy(vec[0], vec[1])
  when 1, 3, 5 # VZ0..2, sign-extended
    to_u32(sign_extend16(@v[reg / 2][2]))
  when 6 then pack_rgbc(@rgbc)
  when 7 then @otz & 0xFFFF
  when 8 then to_u32(sign_extend16(@ir0))
  when 9..11 # IR1..3, sign-extended
    to_u32(sign_extend16([@ir1, @ir2, @ir3][reg - 9]))
  when 12..14 # SXY0..2
    pair = @sxy[reg - 12]
    pack_xy(pair[0], pair[1])
  when 15 # SXYP reads back as a mirror of SXY2
    pack_xy(@sxy[2][0], @sxy[2][1])
  when 16..19 then @sz[reg - 16] & 0xFFFF
  when 20..22 then pack_rgbc(@rgb_fifo[reg - 20])
  when 23 then @res1 & 0xFFFF_FFFF
  when 24 then to_u32(@mac0)
  when 25..27 then to_u32([@mac1, @mac2, @mac3][reg - 25])
  when 28, 29 then pack_irgb # IRGB and ORGB both read the packed IR1..3
  when 30 then to_u32(@lzcs)
  else @lzcr & 0xFFFF_FFFF # reg == 31
  end
end
108
+
109
# Write +value+ (truncated to 32 bits) to data register +idx+ (low 5 bits).
# Writes to ORGB (29) and LZCR (31) are ignored.
def write_data(idx, value)
  word = value & 0xFFFF_FFFF
  lo = sign16(word & 0xFFFF)         # lower halfword as S16
  hi = sign16((word >> 16) & 0xFFFF) # upper halfword as S16
  reg = idx & 0x1F

  case reg
  when 0, 2, 4 # VXY0..2
    vec = @v[reg / 2]
    vec[0] = lo
    vec[1] = hi
  when 1, 3, 5 then @v[reg / 2][2] = lo
  when 6 then @rgbc = unpack_rgbc(word)
  when 7 then @otz = word & 0xFFFF
  when 8 then @ir0 = lo
  when 9 then @ir1 = lo
  when 10 then @ir2 = lo
  when 11 then @ir3 = lo
  when 12..14 # SXY0..2 are overwritten in place
    pair = @sxy[reg - 12]
    pair[0] = lo
    pair[1] = hi
  when 15
    # SXYP: a write pushes the FIFO (SXY0 <- SXY1, SXY1 <- SXY2, SXY2 <- new).
    @sxy.shift
    @sxy[2] = [lo, hi]
  when 16..19 then @sz[reg - 16] = word & 0xFFFF
  when 20..22 then @rgb_fifo[reg - 20] = unpack_rgbc(word)
  when 23 then @res1 = word
  when 24 then @mac0 = sign32(word)
  when 25 then @mac1 = sign32(word)
  when 26 then @mac2 = sign32(word)
  when 27 then @mac3 = sign32(word)
  when 28
    # IRGB: each 5-bit field is shifted left by 7 into IR1/IR2/IR3.
    @ir1 = (word & 0x1F) << 7
    @ir2 = ((word >> 5) & 0x1F) << 7
    @ir3 = ((word >> 10) & 0x1F) << 7
  when 30
    # LZCS: storing a value also refreshes the LZCR count.
    @lzcs = sign32(word)
    @lzcr = leading_sign_bits(@lzcs)
  end
end
169
+
170
# Read control register +idx+ (low 5 bits) as an unsigned 32-bit word.
#
# Registers 0..23 are three banks of eight: five words holding a 3x3 S16
# matrix (two entries per word, the ninth alone and sign-extended) followed
# by a three-word S32 vector.
def read_control(idx)
  reg = idx & 0x1F

  if reg < 24
    bank, slot = reg.divmod(8)
    matrix, vector = [[@rt, @tr], [@ls, @bk], [@lc, @fc]][bank]
    return to_u32(vector[slot - 5]) if slot >= 5

    cells = matrix.flatten # row-major: m11 m12 m13 m21 ... m33
    return to_u32(sign_extend16(cells[8])) if slot == 4

    return pack_xy(cells[2 * slot], cells[2 * slot + 1])
  end

  case reg
  when 24 then to_u32(@ofx)
  when 25 then to_u32(@ofy)
  when 26 then to_u32(sign_extend16(@h)) # H is stored unsigned but read back sign-extended
  when 27 then to_u32(sign_extend16(@dqa))
  when 28 then to_u32(@dqb)
  when 29 then to_u32(sign_extend16(@zsf3))
  when 30 then to_u32(sign_extend16(@zsf4))
  else
    # FLAG: bit 31 is recomputed as the OR of the error-flag bits.
    without_summary = @flag & ~0x8000_0000
    if (without_summary & FLAG_ERROR_MASK).zero?
      without_summary
    else
      without_summary | 0x8000_0000
    end
  end
end
211
+
212
# Write +value+ (truncated to 32 bits) to control register +idx+ (low 5 bits).
#
# Registers 0..23 are three banks of eight: five words for a 3x3 S16 matrix
# (two entries per word, the ninth from the low halfword only) followed by a
# three-word S32 vector.
def write_control(idx, value)
  word = value & 0xFFFF_FFFF
  reg = idx & 0x1F

  if reg < 24
    bank, slot = reg.divmod(8)
    matrix, vector = [[@rt, @tr], [@ls, @bk], [@lc, @fc]][bank]
    if slot >= 5
      vector[slot - 5] = sign32(word)
    elsif slot == 4
      matrix[2][2] = sign16(word & 0xFFFF)
    else
      first = 2 * slot # row-major index of the entry in the low halfword
      second = first + 1
      matrix[first / 3][first % 3] = sign16(word & 0xFFFF)
      matrix[second / 3][second % 3] = sign16((word >> 16) & 0xFFFF)
    end
  else
    case reg
    when 24 then @ofx = sign32(word)
    when 25 then @ofy = sign32(word)
    when 26 then @h = word & 0xFFFF
    when 27 then @dqa = sign16(word & 0xFFFF)
    when 28 then @dqb = sign32(word)
    when 29 then @zsf3 = sign16(word & 0xFFFF)
    when 30 then @zsf4 = sign16(word & 0xFFFF)
    else
      # FLAG: only bits 30..12 are stored; bit 31 is derived on read.
      @flag = word & 0x7FFF_F000
    end
  end
end
264
+
265
+ # --- Command dispatch ---------------------------------------------------
266
+
267
# Run one GTE command. +instruction+ is the command word of the COP2
# instruction; the fields decoded from it are:
#   bit 19     sf - when set, results are shifted right by 12
#   bits 18-17 MVMVA matrix select
#   bits 16-15 MVMVA vector select
#   bits 14-13 MVMVA translation select
#   bit 10     lm - passed on to the IR1..3 saturation
#   bits 5-0   opcode
# FLAG is cleared first; unknown opcodes leave all other state untouched.
def execute(instruction)
  @flag = 0
  shift = instruction[19] == 1 ? 12 : 0
  lm = instruction[10] == 1

  case instruction & 0x3F
  when 0x01 then cmd_rtps(0, shift, lm, push_sxy: true)
  when 0x06 then cmd_nclip
  when 0x0C then cmd_op(shift, lm)
  when 0x10 then cmd_dpcs(shift, lm, @rgbc)
  when 0x11 then cmd_intpl(shift, lm)
  when 0x12
    matrix_sel = (instruction >> 17) & 3
    vector_sel = (instruction >> 15) & 3
    offset_sel = (instruction >> 13) & 3
    cmd_mvmva(shift, lm, matrix_sel, vector_sel, offset_sel)
  when 0x13 then cmd_ncds(0, shift, lm)
  when 0x14 then cmd_cdp(shift, lm)
  when 0x16 then cmd_ncdt(shift, lm)
  when 0x1B then cmd_nccs(0, shift, lm)
  when 0x1C then cmd_cc(shift, lm)
  when 0x1E then cmd_ncs(0, shift, lm)
  when 0x20 then cmd_nct(shift, lm)
  when 0x28 then cmd_sqr(shift, lm)
  when 0x29 then cmd_dcpl(shift, lm)
  when 0x2A then cmd_dpct(shift, lm)
  when 0x2D then cmd_avsz3
  when 0x2E then cmd_avsz4
  when 0x30 then cmd_rtpt(shift, lm)
  when 0x3D then cmd_gpf(shift, lm)
  when 0x3E then cmd_gpl(shift, lm)
  when 0x3F then cmd_ncct(shift, lm)
  end

  # Summary bit 31 is set when any of the error-flag bits were raised.
  @flag |= 0x8000_0000 unless (@flag & FLAG_ERROR_MASK).zero?
end
308
+
309
+ # --- Helpers: bit-width conversion --------------------------------------
310
+
311
+ private
312
+
313
# Keep only the low 32 bits of +v+, giving its unsigned 32-bit representation.
def to_u32(v)
  v & 0xFFFF_FFFF
end
314
+
315
# Interpret the low 16 bits of +v+ as a two's-complement signed integer.
def sign16(v)
  # Flipping the sign bit and then subtracting its weight maps
  # 0x8000..0xFFFF onto -0x8000..-1 and leaves 0..0x7FFF unchanged.
  ((v & 0xFFFF) ^ 0x8000) - 0x8000
end
319
+
320
# Interpret the low 32 bits of +v+ as a two's-complement signed integer.
def sign32(v)
  # Same sign-bit flip as the 16-bit variant, widened to 32 bits.
  ((v & 0xFFFF_FFFF) ^ 0x8000_0000) - 0x8000_0000
end
324
+
325
# Sign-extend the low 16 bits of +v+ to 32 bits and return the result as an
# unsigned 32-bit word (the upper half is all ones when bit 15 is set).
def sign_extend16(v)
  half = v & 0xFFFF
  half < 0x8000 ? half : half | 0xFFFF_0000
end
329
+
330
# Pack two 16-bit values into one 32-bit word: +x+ in bits 0..15 and +y+ in
# bits 16..31. Each argument is truncated to 16 bits.
def pack_xy(x, y)
  low_half = x & 0xFFFF
  high_half = (y & 0xFFFF) << 16
  high_half | low_half
end
333
+
334
# Pack an [r, g, b, code] array into a 32-bit word with r in the lowest byte
# and code in the highest. Each component is truncated to 8 bits.
def pack_rgbc(rgbc)
  r, g, b, c = rgbc
  [r, g, b, c].each_with_index.reduce(0) do |word, (component, byte)|
    word | ((component & 0xFF) << (8 * byte))
  end
end
338
+
339
# Split a 32-bit word into its four bytes, lowest byte first:
# [r, g, b, code].
def unpack_rgbc(v)
  Array.new(4) { |byte| (v >> (8 * byte)) & 0xFF }
end
342
+
343
# Build the 15-bit packed colour read through IRGB/ORGB: each of IR1..IR3 is
# divided by 0x80 (arithmetic shift), clamped to 0..0x1F and placed in
# consecutive 5-bit fields starting with IR1 at bit 0.
def pack_irgb
  [@ir1, @ir2, @ir3].each_with_index.sum do |ir, field|
    (ir >> 7).clamp(0, 0x1F) << (5 * field)
  end
end
349
+
350
# Count how many of the top bits of the 32-bit value +v+ are equal to its
# sign bit (bit 31). The result is in 1..32. Used for LZCS/LZCR.
def leading_sign_bits(v)
  word = v & 0xFFFF_FFFF
  # Invert negative values so the problem becomes counting leading zeros.
  word ^= 0xFFFF_FFFF if word[31] == 1
  32 - word.bit_length
end
363
+
364
+ # --- Helpers: saturation / overflow flags -------------------------------
365
+
366
# Raise the MACi positive/negative overflow flag (i = 1, 2 or 3; any other
# value selects the MAC3 flags) when +v+ lies outside -2**43 .. 2**43 - 1.
def check_mac_overflow(i, v)
  pos_bit, neg_bit =
    case i
    when 1 then [FLAG_MAC1_POS, FLAG_MAC1_NEG]
    when 2 then [FLAG_MAC2_POS, FLAG_MAC2_NEG]
    else [FLAG_MAC3_POS, FLAG_MAC3_NEG]
    end
  bound = 1 << 43
  @flag |= pos_bit if v >= bound
  @flag |= neg_bit if v < -bound
end
374
+
375
# Raise the MAC0 positive/negative overflow flag when +v+ does not fit in a
# signed 32-bit integer (-2**31 .. 2**31 - 1).
def check_mac0_overflow(v)
  @flag |= FLAG_MAC0_POS if v > 0x7FFF_FFFF
  @flag |= FLAG_MAC0_NEG if v < -0x8000_0000
end
381
+
382
# Clamp +v+ to the IR range and return the clamped value. The range is
# -0x8000..0x7FFF, or 0..0x7FFF when +lm+ is truthy. When clamping changes
# the value, the IRi saturation flag is set (i = 1, 2 or 3; any other value
# selects the IR3 flag).
def sat_ir(i, v, lm)
  flag_bit =
    case i
    when 1 then FLAG_IR1_SAT
    when 2 then FLAG_IR2_SAT
    else FLAG_IR3_SAT
    end
  floor = lm ? 0 : -0x8000
  clamped = v.clamp(floor, 0x7FFF)
  @flag |= flag_bit unless clamped == v
  clamped
end
396
+
397
# Clamp +v+ to 0..0x1000 for IR0, setting the IR0 saturation flag when the
# value had to be changed. Returns the clamped value.
def sat_ir0(v)
  return v if v >= 0 && v <= 0x1000

  @flag |= FLAG_IR0_SAT
  v < 0 ? 0 : 0x1000
end
406
+
407
# Clamp +v+ to 0..0xFFFF for SZ3/OTZ, setting the shared SZ3/OTZ saturation
# flag when the value had to be changed. Returns the clamped value.
def sat_sz3(v)
  return v if v >= 0 && v <= 0xFFFF

  @flag |= FLAG_SZ3_OTZ_SAT
  v < 0 ? 0 : 0xFFFF
end
416
+
417
# Clamp a screen X coordinate to -0x400..0x3FF, setting the SX2 saturation
# flag when the value had to be changed. Returns the clamped value.
def sat_sxy_x(v)
  return v if v >= -0x400 && v <= 0x3FF

  @flag |= FLAG_SX2_SAT
  v < -0x400 ? -0x400 : 0x3FF
end
426
+
427
# Clamp a screen Y coordinate to -0x400..0x3FF, setting the SY2 saturation
# flag when the value had to be changed. Returns the clamped value.
def sat_sxy_y(v)
  return v if v >= -0x400 && v <= 0x3FF

  @flag |= FLAG_SY2_SAT
  v < -0x400 ? -0x400 : 0x3FF
end
436
+
437
# Clamp a colour component to 0..0xFF and return it. +channel+ picks the
# flag raised on saturation: 0 = R, 1 = G, anything else = B.
def sat_color(channel, v)
  flag_bit =
    case channel
    when 0 then FLAG_COLOR_R_SAT
    when 1 then FLAG_COLOR_G_SAT
    else FLAG_COLOR_B_SAT
    end
  clamped = v.clamp(0, 0xFF)
  @flag |= flag_bit unless clamped == v
  clamped
end
447
+
448
+ # --- Helpers: FIFO pushes -----------------------------------------------
449
+
450
# Push a depth value onto the SZ FIFO: SZ1..SZ3 move down one slot and the
# new value, saturated to the SZ3 range, becomes SZ3.
def push_sz(value)
  @sz.shift # drop the old SZ0; SZ1..SZ3 become SZ0..SZ2
  @sz[3] = sat_sz3(value)
end
456
+
457
# Push a screen coordinate onto the SXY FIFO: SXY1/SXY2 move down one slot
# and the new, saturated [x, y] pair becomes SXY2.
def push_sxy(x, y)
  @sxy.shift # drop the old SXY0; SXY1/SXY2 become SXY0/SXY1
  @sxy[2] = [sat_sxy_x(x), sat_sxy_y(y)]
end
462
+
463
# Push a colour onto the RGB FIFO. The components are MAC1..MAC3 shifted
# right by 4 and saturated to 0..0xFF; the code byte is copied from RGBC.
def push_rgb_from_mac
  components = [@mac1, @mac2, @mac3].each_with_index.map do |mac, channel|
    sat_color(channel, mac >> 4)
  end
  @rgb_fifo.shift # drop the old RGB0; RGB1/RGB2 become RGB0/RGB1
  @rgb_fifo[2] = components + [@rgbc[3]]
end
471
+
472
+ # --- Helpers: math primitives -------------------------------------------
473
+
474
# Perspective divide used by RTPS/RTPT: returns H / SZ3 as an unsigned
# 16.16 fixed-point value, saturated to 0x1FFFF.
#
# Real hardware uses a Newton-Raphson reciprocal table; this is a plain
# integer division following the documented reference formula
#   n = ((H * 0x20000 / SZ3) + 1) / 2
# i.e. H * 0x10000 / SZ3 rounded to nearest. The final halving is required:
# without it every quotient is doubled and saturates as soon as H >= SZ3.
#
# When H >= SZ3 * 2 (which includes SZ3 == 0) the divide-overflow flag is
# set and the saturated value 0x1FFFF is returned.
def unr_divide
  divisor = @sz[3]
  if divisor != 0 && @h < divisor * 2
    quotient = ((@h.to_i * 0x20000 / divisor) + 1) / 2
    return [quotient, 0x1FFFF].min
  end
  @flag |= FLAG_DIVIDE_OVERFLOW
  0x1FFFF
end
485
+
486
# Store an accumulator into MACi / IRi (i = 1, 2 or 3):
#   1. flag overflow of the unshifted +accum+,
#   2. shift it right by +shift+,
#   3. write the low 32 bits (as a signed value) to MACi and the saturated
#      value to IRi.
# Returns the shifted, untruncated result. For any other +i+ only the
# overflow check runs and no register is written.
#
# Note: IR saturation is applied to the shifted value, which is an
# approximation of the hardware behaviour when sf = 0.
def mac_set(i, accum, shift, lm)
  check_mac_overflow(i, accum)
  result = accum >> shift
  visible = sign32(result) # MAC registers expose only 32 bits

  case i
  when 1
    @ir1 = sat_ir(1, result, lm)
    @mac1 = visible
  when 2
    @ir2 = sat_ir(2, result, lm)
    @mac2 = visible
  when 3
    @ir3 = sat_ir(3, result, lm)
    @mac3 = visible
  end
  result
end
506
+
507
# Interpret the low 64 bits of +v+ as a two's-complement signed integer.
def sign_to_64(v)
  sign_weight = 0x8000_0000_0000_0000
  ((v & 0xFFFF_FFFF_FFFF_FFFF) ^ sign_weight) - sign_weight
end
511
+
512
+ # --- Commands -----------------------------------------------------------
513
+
514
# Perspective-transform vertex V[vi]:
#   MAC1..3 / IR1..3 = (TR << 12) + RT * V, shifted by +shift+
#   SZ3              = third accumulator >> 12 (regardless of +shift+)
#   SX, SY           = (n * IR1 + OFX) >> 16, (n * IR2 + OFY) >> 16
#   MAC0 / IR0       = n * DQA + DQB, IR0 saturated from MAC0 >> 12
# where n is the result of unr_divide. The screen coordinate is pushed onto
# the SXY FIFO only when the +push_sxy+ keyword is truthy.
def cmd_rtps(vi, shift, lm, push_sxy:)
  vertex = @v[vi]
  ax, ay, az = @rt.each_with_index.map do |row, axis|
    (@tr[axis] << 12) + row[0] * vertex[0] + row[1] * vertex[1] + row[2] * vertex[2]
  end

  mac_set(1, ax, shift, lm)
  mac_set(2, ay, shift, lm)
  mac_set(3, az, shift, lm)
  # The depth pushed to the SZ FIFO is always the accumulator shifted by 12.
  push_sz(az >> 12)

  scale = unr_divide

  screen = [[@ir1, @ofx], [@ir2, @ofy]].map do |ir, offset|
    projected = scale * ir + offset
    check_mac0_overflow(projected)
    projected >> 16
  end
  # The parenthesised form is a call to the push_sxy method; the bare name
  # is the keyword argument.
  push_sxy(screen[0], screen[1]) if push_sxy

  depth_cue = scale * @dqa + @dqb
  check_mac0_overflow(depth_cue)
  @mac0 = sign32(depth_cue & 0xFFFF_FFFF)
  @ir0 = sat_ir0(depth_cue >> 12)
end
543
+
544
# Perspective-transform V0, V1 and V2 in order, pushing each result onto the
# SXY FIFO.
def cmd_rtpt(shift, lm)
  (0..2).map { |vi| cmd_rtps(vi, shift, lm, push_sxy: true) }.last
end
549
+
550
# Normal clipping: MAC0 receives the cross product term
#   SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1
# of the three screen coordinates in the SXY FIFO.
def cmd_nclip
  (x0, y0), (x1, y1), (x2, y2) = @sxy
  # Same sum as above, factored per X coordinate.
  area = x0 * (y1 - y2) + x1 * (y2 - y0) + x2 * (y0 - y1)
  check_mac0_overflow(area)
  @mac0 = sign32(area & 0xFFFF_FFFF)
end
558
+
559
# Average of three Z values: MAC0 = ZSF3 * (SZ1 + SZ2 + SZ3), and OTZ is
# MAC0 >> 12 saturated to 0..0xFFFF.
def cmd_avsz3
  scaled = @zsf3 * (@sz[1] + @sz[2] + @sz[3])
  check_mac0_overflow(scaled)
  @mac0 = sign32(scaled & 0xFFFF_FFFF)
  @otz = sat_sz3(scaled >> 12) & 0xFFFF
end
566
+
567
# Average of four Z values: MAC0 = ZSF4 * (SZ0 + SZ1 + SZ2 + SZ3), and OTZ
# is MAC0 >> 12 saturated to 0..0xFFFF.
def cmd_avsz4
  scaled = @zsf4 * (@sz[0] + @sz[1] + @sz[2] + @sz[3])
  check_mac0_overflow(scaled)
  @mac0 = sign32(scaled & 0xFFFF_FFFF)
  @otz = sat_sz3(scaled >> 12) & 0xFFFF
end
574
+
575
# General matrix * vector + translation: MAC = (T << 12) + M * V, then
# IR = saturate(MAC).
#   mx_sel: 0 = rotation, 1 = light source, 2 = light colour; any other
#           value falls back to the rotation matrix (the hardware's garbage
#           matrix is not modelled).
#   mv_sel: 0..2 = V0..V2; any other value = [IR1, IR2, IR3].
#   cv_sel: 0 = translation, 1 = background colour, 2 = far colour (treated
#           as a plain translation here); any other value = zero vector.
def cmd_mvmva(shift, lm, mx_sel, mv_sel, cv_sel)
  matrix =
    case mx_sel
    when 1 then @ls
    when 2 then @lc
    else @rt
    end

  vector =
    case mv_sel
    when 0, 1, 2 then @v[mv_sel]
    else [@ir1, @ir2, @ir3]
    end

  offset =
    case cv_sel
    when 0 then @tr
    when 1 then @bk
    when 2 then @fc
    else [0, 0, 0]
    end

  # All three accumulators are formed before any IR register is rewritten.
  a0, a1, a2 = matrix.each_with_index.map do |row, axis|
    (offset[axis] << 12) + row[0] * vector[0] + row[1] * vector[1] + row[2] * vector[2]
  end

  mac_set(1, a0, shift, lm)
  mac_set(2, a1, shift, lm)
  mac_set(3, a2, shift, lm)
end
606
+
607
# Outer product of the rotation-matrix diagonal (D1, D2, D3) with
# [IR1, IR2, IR3]:
#   MAC1 = IR3*D2 - IR2*D3, MAC2 = IR1*D3 - IR3*D1, MAC3 = IR2*D1 - IR1*D2
def cmd_op(shift, lm)
  d1, d2, d3 = Array.new(3) { |k| @rt[k][k] }
  ir1 = @ir1
  ir2 = @ir2
  ir3 = @ir3
  products = [
    ir3 * d2 - ir2 * d3,
    ir1 * d3 - ir3 * d1,
    ir2 * d1 - ir1 * d2
  ]
  mac_set(1, products[0], shift, lm)
  mac_set(2, products[1], shift, lm)
  mac_set(3, products[2], shift, lm)
end
616
+
617
# Square each of IR1..IR3: MACi = IRi * IRi (shifted), IRi = saturate(MACi).
def cmd_sqr(shift, lm)
  squares = [@ir1, @ir2, @ir3].map { |ir| ir * ir }
  mac_set(1, squares[0], shift, lm)
  mac_set(2, squares[1], shift, lm)
  mac_set(3, squares[2], shift, lm)
end
622
+
623
# Depth cueing of a single colour. +color+ is an [r, g, b, code] array.
#
# Each component starts as COLOR << 16 and is moved toward the far colour:
#   acc = acc + (((FC << 12) - acc) >> shift) * IR0
# The three accumulators are then stored through mac_set and the resulting
# colour is pushed onto the RGB FIFO.
#
# This is an approximation: the intermediate difference is not saturated.
def cmd_dpcs(shift, lm, color)
  r, g, b, _cd = color
  mac1_in = r << 16
  mac2_in = g << 16
  mac3_in = b << 16
  mac1_in += (((@fc[0] << 12) - mac1_in) >> shift) * @ir0
  mac2_in += (((@fc[1] << 12) - mac2_in) >> shift) * @ir0
  mac3_in += (((@fc[2] << 12) - mac3_in) >> shift) * @ir0
  mac_set(1, mac1_in, shift, lm)
  mac_set(2, mac2_in, shift, lm)
  mac_set(3, mac3_in, shift, lm)
  push_rgb_from_mac
end
639
+
640
# Depth cueing applied three times. Each pass reads the current RGB0 entry
# and pushes its result onto the RGB FIFO, so RGB0 is a different entry on
# every pass.
def cmd_dpct(shift, lm)
  3.times do
    oldest = @rgb_fifo[0]
    cmd_dpcs(shift, lm, oldest)
  end
end
643
+
644
# Interpolate [IR1, IR2, IR3] toward the far colour. Each accumulator
# starts as IR << 12 and becomes
#   acc + (((FC << 12) - acc) >> shift) * IR0
# before being stored through mac_set; the colour is then pushed onto the
# RGB FIFO.
def cmd_intpl(shift, lm)
  blended = [@ir1, @ir2, @ir3].each_with_index.map do |ir, k|
    start = ir << 12
    start + (((@fc[k] << 12) - start) >> shift) * @ir0
  end
  blended.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
656
+
657
# First lighting stage: MAC/IR := light-source matrix * V[vi].
def light_normal(vi, shift, lm)
  normal = @v[vi]
  dots = @ls.map do |row|
    row[0] * normal[0] + row[1] * normal[1] + row[2] * normal[2]
  end
  mac_set(1, dots[0], shift, lm)
  mac_set(2, dots[1], shift, lm)
  mac_set(3, dots[2], shift, lm)
end
664
+
665
# Second lighting stage: MAC/IR := (BK << 12) + light-colour matrix * IR.
#
# IR1..IR3 are snapshotted before the first mac_set call. mac_set rewrites
# IRi as it stores each row, so reading @ir1/@ir2 again for the later rows
# would mix already-updated outputs into the matrix product.
def light_color(shift, lm)
  ir = [@ir1, @ir2, @ir3]
  sums = @lc.each_with_index.map do |row, k|
    (@bk[k] << 12) + row[0] * ir[0] + row[1] * ir[1] + row[2] * ir[2]
  end
  mac_set(1, sums[0], shift, lm)
  mac_set(2, sums[1], shift, lm)
  mac_set(3, sums[2], shift, lm)
end
671
+
672
# Normal colour for vertex V[vi]: run both lighting stages, then push the
# resulting colour onto the RGB FIFO.
def cmd_ncs(vi, shift, lm)
  light_normal(vi, shift, lm) # IR = LS * V
  light_color(shift, lm)      # IR = BK + LC * IR
  push_rgb_from_mac
end
677
+
678
# Normal colour for V0, V1 and V2 in order.
def cmd_nct(shift, lm)
  (0..2).map { |vi| cmd_ncs(vi, shift, lm) }.last
end
683
+
684
# Normal colour modulated by RGBC for vertex V[vi]: after both lighting
# stages, each accumulator is (component * IRi) << 4; the result is pushed
# onto the RGB FIFO.
def cmd_nccs(vi, shift, lm)
  light_normal(vi, shift, lm)
  light_color(shift, lm)
  r, g, b, = @rgbc
  tinted = [r * @ir1, g * @ir2, b * @ir3].map { |product| product << 4 }
  tinted.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
693
+
694
# RGBC-modulated normal colour for V0, V1 and V2 in order.
def cmd_ncct(shift, lm)
  (0..2).map { |vi| cmd_nccs(vi, shift, lm) }.last
end
699
+
700
# Normal colour with depth cue for vertex V[vi]: after both lighting
# stages, each accumulator starts as (component * IRi) << 4 and is moved
# toward the far colour by
#   acc + (((FC << 12) - acc) >> shift) * IR0
# The result is stored through mac_set and pushed onto the RGB FIFO.
def cmd_ncds(vi, shift, lm)
  light_normal(vi, shift, lm)
  light_color(shift, lm)
  r, g, b, = @rgbc
  lit = [(r * @ir1) << 4, (g * @ir2) << 4, (b * @ir3) << 4]
  cued = lit.each_with_index.map do |base, k|
    base + (((@fc[k] << 12) - base) >> shift) * @ir0
  end
  cued.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
716
+
717
# Depth-cued normal colour for V0, V1 and V2 in order.
def cmd_ncdt(shift, lm)
  (0..2).map { |vi| cmd_ncds(vi, shift, lm) }.last
end
722
+
723
# Colour-colour: run the light-colour stage on the current IR vector, then
# modulate by RGBC (accumulator = (component * IRi) << 4) and push the
# result onto the RGB FIFO.
def cmd_cc(shift, lm)
  r, g, b, = @rgbc
  light_color(shift, lm)
  tinted = [(r * @ir1) << 4, (g * @ir2) << 4, (b * @ir3) << 4]
  tinted.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
731
+
732
# Colour depth cue: run the light-colour stage on the current IR vector,
# modulate by RGBC (accumulator = (component * IRi) << 4), then move each
# accumulator toward the far colour by
#   acc + (((FC << 12) - acc) >> shift) * IR0
# The result is stored through mac_set and pushed onto the RGB FIFO.
def cmd_cdp(shift, lm)
  r, g, b, = @rgbc
  light_color(shift, lm)
  lit = [(r * @ir1) << 4, (g * @ir2) << 4, (b * @ir3) << 4]
  cued = lit.each_with_index.map do |base, k|
    base + (((@fc[k] << 12) - base) >> shift) * @ir0
  end
  cued.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
746
+
747
# Depth cue of the current light: each accumulator starts as
# (RGBC component * IRi) << 4 and is moved toward the far colour by
#   acc + (((FC << 12) - acc) >> shift) * IR0
# The result is stored through mac_set and pushed onto the RGB FIFO.
def cmd_dcpl(shift, lm)
  r, g, b, = @rgbc
  lit = [(r * @ir1) << 4, (g * @ir2) << 4, (b * @ir3) << 4]
  cued = lit.each_with_index.map do |base, k|
    base + (((@fc[k] << 12) - base) >> shift) * @ir0
  end
  cued.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
760
+
761
# General-purpose interpolation: MACi = IR0 * IRi for i = 1..3, then push
# the resulting colour onto the RGB FIFO.
def cmd_gpf(shift, lm)
  scaled = [@ir1, @ir2, @ir3].map { |ir| @ir0 * ir }
  scaled.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
767
+
768
# General-purpose interpolation with base:
# MACi = (MACi << shift) + IR0 * IRi for i = 1..3, then push the resulting
# colour onto the RGB FIFO.
def cmd_gpl(shift, lm)
  bases = [@mac1, @mac2, @mac3]
  factors = [@ir1, @ir2, @ir3]
  sums = bases.zip(factors).map { |mac, ir| (mac << shift) + @ir0 * ir }
  sums.each_with_index { |acc, k| mac_set(k + 1, acc, shift, lm) }
  push_rgb_from_mac
end
774
+ end
775
+ end