gym-csle-stopping-game 0.9.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,699 @@
1
+ from typing import Any, Tuple
2
+ import itertools
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+ from scipy.stats import betabinom
6
+ from gym_csle_stopping_game.dao.stopping_game_config import StoppingGameConfig
7
+ from csle_common.dao.training.policy import Policy
8
+
9
+
10
+ class StoppingGameUtil:
11
+ """
12
+ Class with utility functions for the StoppingGame Environment
13
+ """
14
+
15
+ @staticmethod
16
+ def b1() -> npt.NDArray[np.float64]:
17
+ """
18
+ Gets the initial belief
19
+
20
+ :return: the initial belief
21
+ """
22
+ return np.array([1.0, 0.0, 0.0])
23
+
24
+ @staticmethod
25
+ def state_space() -> npt.NDArray[np.int32]:
26
+ """
27
+ Gets the state space
28
+
29
+ :return: the state space of the game
30
+ """
31
+ return np.array([0, 1, 2])
32
+
33
+ @staticmethod
34
+ def defender_actions() -> npt.NDArray[np.int32]:
35
+ """
36
+ Gets the action space of the defender
37
+
38
+ :return: the action space of the defender
39
+ """
40
+ return np.array([0, 1])
41
+
42
+ @staticmethod
43
+ def attacker_actions() -> npt.NDArray[np.int32]:
44
+ """
45
+ Gets the action space of the attacker
46
+
47
+ :return: the action space of the attacker
48
+ """
49
+ return np.array([0, 1])
50
+
51
+ @staticmethod
52
+ def observation_space(n: int) -> npt.NDArray[np.int32]:
53
+ """
54
+ Returns the observation space of size n
55
+
56
+ :param n: the maximum observation
57
+ :return: the observation space
58
+ """
59
+ return np.array(list(range(n + 1)))
60
+
61
+ @staticmethod
62
+ def reward_tensor(R_SLA: int, R_INT: int, R_COST: int, L: int, R_ST: int) -> npt.NDArray[Any]:
63
+ """
64
+ Gets the reward tensor
65
+
66
+ :param R_SLA: the R_SLA constant
67
+ :param R_INT: the R_INT constant
68
+ :param R_COST: the R_COST constant
69
+ :param L: the maximum number of stop actions
+ :param R_ST: the R_ST constant
70
+ :return: a |L|x|A1|x|A2|x|S| tensor
71
+ """
72
+
73
+ R_l = []
74
+ for l in range(1, L + 1):
75
+ R = [
76
+ # Defender continues
77
+ [
78
+ # Attacker continues
79
+ [R_SLA, R_SLA + R_INT, 0],
80
+ # Attacker stops
81
+ [R_SLA, R_SLA, 0]
82
+ ],
83
+ # Defender stops
84
+ [
85
+ # Attacker continues
86
+ [R_COST / l, R_ST / l, 0],
87
+ # Attacker stops
88
+ [R_COST / l, R_SLA, 0]
89
+ ]
90
+ ]
91
+ R_l.append(R)
92
+ return np.array(R_l)
93
+
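# Illustrative usage sketch (not from the package source): constructing and indexing the
# reward tensor above; the constants are hypothetical example values.
R = StoppingGameUtil.reward_tensor(R_SLA=1, R_INT=-10, R_COST=-5, L=3, R_ST=20)
# R has shape (L, |A1|, |A2|, |S|). For example, the reward when l=3 stops remain,
# the defender stops (a1=1), the attacker continues (a2=0), and an intrusion is ongoing (s=1):
r = R[3 - 1][1][0][1]  # = R_ST / l = 20 / 3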
94
+ @staticmethod
95
+ def transition_tensor(L: int) -> npt.NDArray[Any]:
96
+ """
97
+ Gets the transition tensor
98
+
99
+ :param L: the maximum number of stop actions
100
+ :return: a |L|x|A1|x|A2|x|S|x|S| tensor
101
+ """
102
+ T_l = []
103
+ for l in range(1, L + 1):
104
+ if l == 1:
105
+ T = [
106
+ # Defender continues
107
+ [
108
+ # Attacker continues
109
+ [
110
+ [1.0, 0.0, 0.0], # No intrusion
111
+ [0.0, 1.0, 0.0], # Intrusion
112
+ [0.0, 0.0, 1.0] # Terminal
113
+ ],
114
+ # Attacker stops
115
+ [
116
+ [0.0, 1.0, 0.0], # No intrusion
117
+ [0.0, 0.0, 1.0], # Intrusion
118
+ [0.0, 0.0, 1.0] # Terminal
119
+ ]
120
+ ],
121
+
122
+ # Defender stops
123
+ [
124
+ # Attacker continues
125
+ [
126
+ [0.0, 0.0, 1.0], # No intrusion
127
+ [0.0, 0.0, 1.0], # Intrusion
128
+ [0.0, 0.0, 1.0] # Terminal
129
+ ],
130
+ # Attacker stops
131
+ [
132
+ [0.0, 0.0, 1.0], # No Intrusion
133
+ [0.0, 0.0, 1.0], # Intrusion
134
+ [0.0, 0.0, 1.0] # Terminal
135
+ ]
136
+ ]
137
+ ]
138
+ else:
139
+ T = [
140
+ # Defender continues
141
+ [
142
+ # Attacker continues
143
+ [
144
+ [1.0, 0.0, 0.0], # No intrusion
145
+ [0.0, 1.0 - 1.0 / (2.0 * l), 1.0 / (2.0 * l)], # Intrusion
146
+ [0.0, 0.0, 1.0] # Terminal
147
+ ],
148
+ # Attacker stops
149
+ [
150
+ [0.0, 1.0, 0.0], # No intrusion
151
+ [0.0, 0.0, 1.0], # Intrusion
152
+ [0.0, 0.0, 1.0] # Terminal
153
+ ]
154
+ ],
155
+
156
+ # Defender stops
157
+ [
158
+ # Attacker continues
159
+ [
160
+ [1.0, 0.0, 0.0], # No intrusion
161
+ [0.0, 1.0 - 1.0 / (2.0 * l), 1.0 / (2.0 * l)], # Intrusion
162
+ [0.0, 0.0, 1.0] # Terminal
163
+ ],
164
+ # Attacker stops
165
+ [
166
+ [0.0, 1.0, 0.0], # No Intrusion
167
+ [0.0, 0.0, 1.0], # Intrusion
168
+ [0.0, 0.0, 1.0] # Terminal
169
+ ]
170
+ ]
171
+ ]
172
+ T_l.append(T)
173
+ return np.array(T_l)
174
+
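# Illustrative usage sketch (not from the package source): checking that the transition
# tensor above is row-stochastic; L=3 is an arbitrary example value.
T = StoppingGameUtil.transition_tensor(L=3)
assert T.shape == (3, 2, 2, 3, 3)  # |L| x |A1| x |A2| x |S| x |S|
assert np.allclose(T.sum(axis=-1), 1.0)  # each T[l][a1][a2][s] is a distribution over s'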
175
+ @staticmethod
176
+ def observation_tensor(n: int) -> npt.NDArray[Any]:
177
+ """
178
+ Gets the observation tensor
+
+ :param n: the maximum observation
+ :return: a |A1|x|A2|x|S|x|O| tensor
179
+ """
180
+ intrusion_dist = []
181
+ no_intrusion_dist = []
182
+ terminal_dist = np.zeros(n + 1)
183
+ terminal_dist[-1] = 1
184
+ intrusion_rv = betabinom(n=n, a=1, b=0.7)
185
+ no_intrusion_rv = betabinom(n=n, a=0.7, b=3)
186
+ for i in range(n + 1):
187
+ intrusion_dist.append(intrusion_rv.pmf(i))
188
+ no_intrusion_dist.append(no_intrusion_rv.pmf(i))
189
+ Z = np.array(
190
+ [
191
+ [
192
+ [
193
+ no_intrusion_dist,
194
+ intrusion_dist,
195
+ terminal_dist
196
+ ],
197
+ [
198
+ no_intrusion_dist,
199
+ intrusion_dist,
200
+ terminal_dist
201
+ ],
202
+ ],
203
+ [
204
+ [
205
+ no_intrusion_dist,
206
+ intrusion_dist,
207
+ terminal_dist
208
+ ],
209
+ [
210
+ no_intrusion_dist,
211
+ intrusion_dist,
212
+ terminal_dist
213
+ ],
214
+ ]
215
+ ]
216
+ )
217
+ return Z
218
+
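# Illustrative usage sketch (not from the package source): every conditional observation
# distribution Z[a1][a2][s] sums to one; n=10 is an arbitrary example value.
Z = StoppingGameUtil.observation_tensor(n=10)
assert Z.shape == (2, 2, 3, 11)  # |A1| x |A2| x |S| x |O|
assert np.allclose(Z.sum(axis=-1), 1.0)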
219
+ @staticmethod
220
+ def sample_next_state(T: npt.NDArray[Any], l: int, s: int, a1: int, a2: int, S: npt.NDArray[np.int32]) -> int:
221
+ """
222
+ Samples the next state
223
+
224
+ :param T: the transition operator
225
+ :param s: the current state
226
+ :param a1: the defender action
227
+ :param a2: the attacker action
228
+ :param S: the state space
229
+ :param l: the number of stops remaining
230
+ :return: s'
231
+ """
232
+ state_probs = []
233
+ for s_prime in S:
234
+ state_probs.append(T[l - 1][a1][a2][s][s_prime])
235
+ return int(np.random.choice(np.arange(0, len(S)), p=state_probs))
236
+
237
+ @staticmethod
238
+ def sample_initial_state(b1: npt.NDArray[np.float64]) -> int:
239
+ """
240
+ Samples the initial state
241
+
242
+ :param b1: the initial belief
243
+ :return: s1
244
+ """
245
+ return int(np.random.choice(np.arange(0, len(b1)), p=b1))
246
+
247
+ @staticmethod
248
+ def sample_next_observation(Z: npt.NDArray[Any], s_prime: int, O: npt.NDArray[np.int32]) -> int:
249
+ """
250
+ Samples the next observation
251
+
252
+ :param Z: the observation tensor, which includes the observation probabilities
253
+ :param s_prime: the new state
254
+ :param O: the observation space
255
+ :return: o
256
+ """
257
+ observation_probs = []
258
+ for o in O:
259
+ if len(Z.shape) == 4:
260
+ observation_probs.append(Z[0][0][s_prime][o])
261
+ elif len(Z.shape) == 3:
262
+ observation_probs.append(Z[0][s_prime][o])
263
+ elif len(Z.shape) == 2:
264
+ observation_probs.append(Z[s_prime][o])
265
+ o = np.random.choice(np.arange(0, len(O)), p=observation_probs)
266
+ return int(o)
267
+
268
+ @staticmethod
269
+ def bayes_filter(s_prime: int, o: int, a1: int, b: npt.NDArray[np.float64], pi2: npt.NDArray[Any], l: int,
270
+ config: StoppingGameConfig) -> float:
271
+ """
272
+ A Bayesian filter to compute the belief of player 1
273
+ of being in s_prime when observing o after taking action a1 in belief b, given that the opponent follows
274
+ strategy pi2
275
+
276
+ :param s_prime: the state to compute the belief of
277
+ :param o: the observation
278
+ :param a1: the action of player 1
279
+ :param b: the current belief point
280
+ :param pi2: the policy of player 2
281
+ :param l: stops remaining
+ :param config: the game config
282
+ :return: b_prime(s_prime)
283
+ """
284
+ l = l - 1
285
+ norm = 0
286
+ for s in config.S:
287
+ for a2 in config.A2:
288
+ for s_prime_1 in config.S:
289
+ prob_1 = config.Z[a1][a2][s_prime_1][o]
290
+ norm += b[s] * prob_1 * config.T[l][a1][a2][s][s_prime_1] * pi2[s][a2]
291
+ if norm == 0:
292
+ return 0
293
+ temp = 0
294
+
295
+ for s in config.S:
296
+ for a2 in config.A2:
297
+ temp += config.Z[a1][a2][s_prime][o] * config.T[l][a1][a2][s][s_prime] * b[s] * pi2[s][a2]
298
+ b_prime_s_prime = temp / norm
299
+ if round(b_prime_s_prime, 2) > 1:
300
+ print(f"b_prime_s_prime >= 1: {b_prime_s_prime}, a1:{a1}, s_prime:{s_prime}, l:{l}, o:{o}, pi2:{pi2}")
301
+ assert round(b_prime_s_prime, 2) <= 1
302
+ if s_prime == 2 and o != config.O[-1]:
303
+ assert round(b_prime_s_prime, 2) <= 0.01
304
+ return float(b_prime_s_prime)
305
+
306
+ @staticmethod
307
+ def next_belief(o: int, a1: int, b: npt.NDArray[np.float64], pi2: npt.NDArray[Any],
308
+ config: StoppingGameConfig, l: int, a2: int = 0, s: int = 0) -> npt.NDArray[np.float64]:
309
+ """
310
+ Computes the next belief using a Bayesian filter
311
+
312
+ :param o: the latest observation
313
+ :param a1: the latest action of player 1
314
+ :param b: the current belief
315
+ :param pi2: the policy of player 2
316
+ :param config: the game config
317
+ :param l: stops remaining
318
+ :param a2: the attacker action (for debugging, should be consistent with pi2)
319
+ :param s: the true state (for debugging)
320
+ :return: the new belief
321
+ """
322
+ b_prime = np.zeros(len(config.S))
323
+ for s_prime in config.S:
324
+ b_prime[s_prime] = StoppingGameUtil.bayes_filter(s_prime=s_prime, o=o, a1=a1, b=b,
325
+ pi2=pi2, config=config, l=l)
326
+ if round(sum(b_prime), 2) != 1:
327
+ print(f"error, b_prime:{b_prime}, o:{o}, a1:{a1}, b:{b}, pi2:{pi2}, "
328
+ f"a2: {a2}, s:{s}")
329
+ assert round(sum(b_prime), 2) == 1
330
+ return b_prime
331
+
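# Illustrative usage sketch (not from the package source) of one belief update. 'config' is
# assumed to be a StoppingGameConfig whose observation space contains o=5; pi2 is a
# hypothetical static attacker policy (rows indexed by state, columns by action).
pi2 = np.array([[0.9, 0.1],   # in s=0 the attacker continues w.p. 0.9
                [1.0, 0.0],   # in s=1 the attacker always continues
                [0.5, 0.5]])  # s=2 is terminal; this row is never used in practice
b = StoppingGameUtil.b1()
b = StoppingGameUtil.next_belief(o=5, a1=0, b=b, pi2=pi2, config=config, l=1)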
332
+ @staticmethod
333
+ def sample_attacker_action(pi2: npt.NDArray[Any], s: int) -> int:
334
+ """
335
+ Samples the attacker action
336
+
337
+ :param pi2: the attacker policy
338
+ :param s: the game state
339
+ :return: a2, the attacker action
340
+ """
341
+ return int(np.random.choice(np.arange(0, len(pi2[s])), p=pi2[s]))
342
+
343
+ @staticmethod
344
+ def pomdp_solver_file(config: StoppingGameConfig, discount_factor: float, pi2: npt.NDArray[Any]) -> str:
345
+ """
346
+ Gets the POMDP environment specification based on the format at http://www.pomdp.org/code/index.html,
347
+ for the defender's local problem against a static attacker
348
+
349
+ :param config: the POMDP config
350
+ :param discount_factor: the discount factor
351
+ :param pi2: the attacker strategy
352
+ :return: the file content as a string
353
+ """
354
+ file_str = ""
355
+ file_str = file_str + f"discount: {discount_factor}\n\n"
356
+ file_str = file_str + "values: reward\n\n"
357
+ file_str = file_str + f"states: {len(config.S)}\n\n"
358
+ file_str = file_str + f"actions: {len(config.A1)}\n\n"
359
+ file_str = file_str + f"observations: {len(config.O)}\n\n"
360
+ initial_belief_str = " ".join(list(map(lambda x: str(x), config.b1)))
361
+ file_str = file_str + f"start: {initial_belief_str}\n\n\n"
362
+ num_transitions = 0
363
+ for s in config.S:
364
+ for a1 in config.A1:
365
+ probs = []
366
+ for s_prime in range(len(config.S)):
367
+ num_transitions += 1
368
+ prob = 0
369
+ for a2 in config.A2:
370
+ prob += config.T[0][a1][a2][s][s_prime] * pi2[s][a2]
371
+ file_str = file_str + f"T: {a1} : {s} : {s_prime} {prob:.80f}\n"
372
+ probs.append(prob)
373
+ assert round(sum(probs), 3) == 1
374
+ file_str = file_str + "\n\n"
375
+ for a1 in config.A1:
376
+ for s_prime in config.S:
377
+ probs = []
378
+ for o in range(len(config.O)):
379
+ prob = config.Z[0][0][s_prime][o]
380
+ file_str = file_str + f"O : {a1} : {s_prime} : {o} {prob:.80f}\n"
381
+ probs.append(prob)
382
+ assert round(sum(probs), 3) == 1
383
+ file_str = file_str + "\n\n"
384
+ for s in config.S:
385
+ for a1 in config.A1:
386
+ for s_prime in config.S:
387
+ for o in config.O:
388
+ r = config.R[0][a1][0][s]
389
+ file_str = file_str + f"R: {a1} : {s} : {s_prime} : {o} {r:.80f}\n"
390
+ return file_str
391
+
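# Illustrative usage sketch (not from the package source): exporting the generated
# specification for an external solver in the pomdp-solve format. 'config' and 'pi2' are
# assumed to be defined as in the sketches above, and the file name is arbitrary.
pomdp_spec = StoppingGameUtil.pomdp_solver_file(config=config, discount_factor=0.99, pi2=pi2)
with open("stopping_game.pomdp", "w") as f:
    f.write(pomdp_spec)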
392
+ @staticmethod
393
+ def reduce_T_attacker(T: npt.NDArray[np.float64], strategy: Policy) -> npt.NDArray[np.float64]:
394
+ """
395
+ Reduces the transition tensor based on a given attacker strategy
396
+
397
+ :param T: the tensor to reduce
398
+ :param strategy: the strategy to use for the reduction
399
+ :return: the reduced tensor (|A1|x|S|x|S|)
400
+ """
401
+ if len(T.shape) == 5:
402
+ T = T[0]
403
+ reduced_T = np.zeros((T.shape[0], T.shape[2], T.shape[3]))
404
+ for i in range(T.shape[0]):
405
+ for j in range(T.shape[2]):
406
+ for k in range(T.shape[3]):
407
+ reduced_T[i][j][k] = T[i][0][j][k] * strategy.probability(j, 0) + T[i][1][j][
408
+ k] * strategy.probability(j, 1)
409
+ # if j == 0:
410
+ # reduced_T[i][j][k] = T[i][0][j][k] * strategy.probability(j, 0) + T[i][1][j][
411
+ # k] * strategy.probability(j, 1)
412
+ # else:
413
+ # reduced_T[i][j][k] = (T[i][0][j][k] * (1 - strategy.probability(j, 0)) + T[i][1][j][k] *
414
+ # strategy.probability(j, 1))
415
+ return reduced_T
416
+
417
+ @staticmethod
418
+ def reduce_R_attacker(R: npt.NDArray[np.float64], strategy: Policy) -> npt.NDArray[np.float64]:
419
+ """
420
+ Reduces the reward tensor based on a given attacker strategy
421
+
422
+ :param R: the reward tensor to reduce
423
+ :param strategy: the strategy to use for the reduction
424
+ :return: the reduced reward tensor (|A1|x|S|)
425
+ """
426
+ if len(R.shape) == 4:
427
+ R = R[0]
428
+ reduced_R = np.zeros((R.shape[0], R.shape[2]))
429
+ for i in range(R.shape[0]):
430
+ for j in range(R.shape[2]):
431
+ reduced_R[i][j] = (R[i][0][j] * strategy.probability(j, 0) + R[i][1][j] *
432
+ strategy.probability(j, 1))
433
+ return reduced_R
434
+
435
+ @staticmethod
436
+ def reduce_Z_attacker(Z: npt.NDArray[np.float64], strategy: Policy) -> npt.NDArray[np.float64]:
437
+ """
438
+ Reduces the observation tensor based on a given attacker strategy
439
+
440
+ :param Z: the observation tensor to reduce
441
+ :param strategy: the strategy to use for the reduction
442
+ :return: the reduced observation tensor (|A1|x|S|x|O|)
443
+ """
444
+ reduced_Z = np.zeros((Z.shape[0], Z.shape[2], Z.shape[3]))
445
+ for i in range(Z.shape[0]):
446
+ for j in range(Z.shape[2]):
447
+ for k in range(Z.shape[3]):
448
+ reduced_Z[i][j][k] = Z[i][0][j][k] * strategy.probability(j, 0) + Z[i][1][j][
449
+ k] * strategy.probability(j, 1)
450
+ return reduced_Z
451
+
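# Illustrative usage sketch (not from the package source). The reduce_*_attacker helpers
# above only rely on the probability(state, action) interface of Policy, so a hypothetical
# stand-in policy suffices to marginalize out a static attacker:
class UniformAttacker:
    def probability(self, s: int, a: int) -> float:
        return 0.5  # the attacker continues or stops with equal probability in every state

T = StoppingGameUtil.transition_tensor(L=3)
reduced_T = StoppingGameUtil.reduce_T_attacker(T=T, strategy=UniformAttacker())  # |A1| x |S| x |S|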
452
+ @staticmethod
453
+ def reduce_T_defender(T: npt.NDArray[np.float64], strategy: Policy) -> npt.NDArray[np.float64]:
454
+ """
455
+ Reduces the transition tensor based on a given defender strategy
456
+
457
+ :param T: the tensor to reduce
458
+ :param strategy: the strategy to use for the reduction
459
+ :return: the reduced tensor (|A2|x|S|x|S|)
460
+ """
461
+ if len(T.shape) == 5:
462
+ T = T[0]
463
+ reduced_T = np.zeros((T.shape[1], T.shape[2], T.shape[3]))
464
+ for i in range(T.shape[1]):
465
+ for j in range(T.shape[2]):
466
+ for k in range(T.shape[3]):
467
+ reduced_T[i][j][k] = (T[0][i][j][k] * strategy.probability(j, 0) + T[1][i][j][k]
468
+ * strategy.probability(j, 1))
469
+ return reduced_T
470
+
471
+ @staticmethod
472
+ def reduce_R_defender(R: npt.NDArray[np.float64], strategy: Policy) -> npt.NDArray[np.float64]:
473
+ """
474
+ Reduces the reward tensor based on a given defender strategy
475
+
476
+ :param R: the reward tensor to reduce
477
+ :param strategy: the strategy to use for the reduction
478
+ :return: the reduced reward tensor (|A2|x|S|)
479
+ """
480
+ if len(R.shape) == 4:
481
+ R = R[0]
482
+ reduced_R = np.zeros((R.shape[1], R.shape[2]))
483
+ for i in range(R.shape[1]):
484
+ for j in range(R.shape[2]):
485
+ reduced_R[i][j] = (R[0][i][j] * strategy.probability(j, 0) + R[1][i][j] *
486
+ strategy.probability(j, 1))
487
+ return reduced_R
488
+
489
+ @staticmethod
490
+ def aggregate_belief_mdp_defender(aggregation_resolution: int, T: npt.NDArray[np.float64],
491
+ R: npt.NDArray[np.float64], Z: npt.NDArray[np.float64],
492
+ S: npt.NDArray[np.int32], A: npt.NDArray[np.int32], O: npt.NDArray[np.int32]) \
493
+ -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.int32], npt.NDArray[np.float64], npt.NDArray[np.float64]]:
494
+ """
495
+ Generates an aggregate belief MDP from a given POMDP specification and aggregation resolution
496
+
497
+ :param aggregation_resolution: the belief aggregation resolution
498
+ :param T: the transition tensor of the POMDP
499
+ :param R: the reward tensor of the POMDP
500
+ :param Z: the observation tensor of the POMDP
501
+ :param S: the state space of the POMDP
502
+ :param A: the action space of the POMDP
503
+ :param O: the observation space of the POMDP
504
+ :return: the state space, action space, transition operator, and belief operator of the belief MDP
505
+ """
506
+ aggregate_belief_space = StoppingGameUtil.generate_aggregate_belief_space(
507
+ n=aggregation_resolution, belief_space_dimension=len(S))
508
+ belief_T = StoppingGameUtil.generate_aggregate_belief_transition_operator(
509
+ aggregate_belief_space=aggregate_belief_space, S=S, A=A, O=O, T=T, Z=Z)
510
+ belief_R = StoppingGameUtil.generate_aggregate_belief_reward_tensor(
511
+ aggregate_belief_space=aggregate_belief_space, S=S, A=A, R=R)
512
+ return aggregate_belief_space, A, belief_T, belief_R
513
+
514
+ @staticmethod
515
+ def generate_aggregate_belief_space(n: int, belief_space_dimension: int) -> npt.NDArray[np.float64]:
516
+ """
517
+ Generate an aggregate belief space B_n.
518
+
519
+ :param n: the aggregation resolution
520
+ :param belief_space_dimension: the belief space dimension
521
+ :return: the aggregate belief space
522
+ """
523
+
524
+ # Generate all combinations of integer allocations k_i such that sum(k_i) = n
525
+ combinations = [k for k in itertools.product(range(n + 1), repeat=belief_space_dimension) if sum(k) == n]
526
+
527
+ # Convert integer allocations to belief points by dividing each k_i by n
528
+ belief_points = [list(k_i / n for k_i in k) for k in combinations]
529
+
530
+ # Remove all beliefs that violate the stopping dynamics
531
+ belief_points = list(filter(lambda x: x[-1] == 1.0 or x[-1] == 0.0, belief_points))
532
+
533
+ return np.array(belief_points)
534
+
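# Illustrative usage sketch (not from the package source): the aggregate belief space over
# the three states of the stopping game, with resolution n=10.
B_n = StoppingGameUtil.generate_aggregate_belief_space(n=10, belief_space_dimension=3)
# Every aggregate belief lies on the grid {0, 0.1, ..., 1}^3, sums to one, and has
# b[2] equal to 0 or 1 (the stopping-dynamics filter above removes all other points).
assert np.allclose(B_n.sum(axis=1), 1.0)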
535
+ @staticmethod
536
+ def generate_aggregate_belief_reward_tensor(
537
+ aggregate_belief_space: npt.NDArray[np.float64], S: npt.NDArray[np.int32], A: npt.NDArray[np.int32],
538
+ R: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
539
+ """
540
+ Generates an aggregate reward tensor for the aggregate belief MDP
541
+
542
+ :param aggregate_belief_space: the aggregate belief space
543
+ :param S: the state space of the POMDP
544
+ :param A: the action space of the POMDP
545
+ :param R: the reward tensor of the POMDP
546
+ :return: the reward tensor of the aggregate belief MDP
547
+ """
548
+ belief_R = np.zeros((len(A), len(aggregate_belief_space)))
549
+ belief_space_list = aggregate_belief_space.tolist()
550
+ for a in A:
551
+ for b in aggregate_belief_space:
552
+ expected_reward = 0
553
+ for s in S:
554
+ expected_reward += R[a][s] * b[s]
555
+ belief_R[a][belief_space_list.index(b.tolist())] = expected_reward
556
+ return belief_R
557
+
558
+ @staticmethod
559
+ def generate_aggregate_belief_transition_operator(
560
+ aggregate_belief_space: npt.NDArray[np.float64], S: npt.NDArray[np.int32], A: npt.NDArray[np.int32],
561
+ O: npt.NDArray[np.int32], T: npt.NDArray[np.float64], Z: npt.NDArray[np.float64]) \
562
+ -> npt.NDArray[np.float64]:
563
+ """
564
+ Generates an aggregate belief space transition operator
565
+
566
+ :param aggregate_belief_space: the aggregate belief space
567
+ :param O: the observation space of the POMDP
568
+ :param S: the state space of the POMDP
569
+ :param A: the action space of the POMDP
570
+ :param T: the transition operator of the POMDP
571
+ :param Z: the observation tensor of the POMDP
572
+ :return: the aggregate belief space operator
573
+ """
574
+ belief_space_list = aggregate_belief_space.tolist()
575
+ belief_T = np.zeros((len(A), len(aggregate_belief_space), len(aggregate_belief_space)))
576
+ for a in A:
577
+ for b1 in aggregate_belief_space:
578
+ for b2 in aggregate_belief_space:
579
+ belief_T[a][belief_space_list.index(b1.tolist())][belief_space_list.index(b2.tolist())] \
580
+ = StoppingGameUtil.aggregate_belief_transition_probability(
581
+ b1=b1, b2=b2, a=a, S=S, O=O, T=T, Z=Z, aggregate_belief_space=aggregate_belief_space, A=A)
582
+ return belief_T
583
+
584
+ @staticmethod
585
+ def aggregate_belief_transition_probability(b1: npt.NDArray[np.float64], b2: npt.NDArray[np.float64], a: int,
586
+ S: npt.NDArray[np.int32], O: npt.NDArray[np.int32],
587
+ A: npt.NDArray[np.int32],
588
+ T: npt.NDArray[np.float64], Z: npt.NDArray[np.float64],
589
+ aggregate_belief_space: npt.NDArray[np.float64]) -> float:
590
+ """
591
+ Calculates the probability of transitioning from belief b1 to belief b2 when taking action a
592
+
593
+ :param b1: the source belief
594
+ :param b2: the target belief
595
+ :param a: the action
596
+ :param S: the state space of the POMDP
597
+ :param O: the observation space of the POMDP
598
+ :param A: the action space of the POMDP
599
+ :param T: the transition operator
600
+ :param Z: the observation tensor
601
+ :param aggregate_belief_space: the aggregate belief space
602
+ :return: the probability P(b2 | b1, a)
603
+ """
604
+ prob = 0
605
+ for o in O:
606
+ if sum([Z[a][s_prime][o] * b1[s] * T[a][s][s_prime] for s in S for s_prime in S]) == 0:
607
+ continue
608
+ b_prime = StoppingGameUtil.pomdp_next_belief(
609
+ o=o, a=a, b=b1, states=S, observations=O, observation_tensor=Z, transition_tensor=T)
610
+ nearest_neighbor = StoppingGameUtil.find_nearest_neighbor_belief(belief_space=aggregate_belief_space,
611
+ target_belief=b_prime)
612
+ if np.array_equal(nearest_neighbor, b2):
613
+ for s in S:
614
+ for s_prime in S:
615
+ prob += Z[a][s_prime][o] * b1[s] * T[a][s][s_prime]
616
+ return prob
617
+
618
+ @staticmethod
619
+ def pomdp_next_belief(o: int, a: int, b: npt.NDArray[np.float64], states: npt.NDArray[np.int32],
620
+ observations: npt.NDArray[np.int32], observation_tensor: npt.NDArray[np.float64],
621
+ transition_tensor: npt.NDArray[np.float64]) \
622
+ -> npt.NDArray[np.float64]:
623
+ """
624
+ Computes the next belief of the POMDP using a Bayesian filter
625
+
626
+ :param o: the latest observation
627
+ :param a: the latest action of player 1
628
+ :param b: the current belief
629
+ :param states: the list of states
630
+ :param observations: the list of observations
631
+ :param observation_tensor: the observation tensor
632
+ :param transition_tensor: the transition tensor
633
+ :return: the new belief
634
+ """
635
+ b_prime = [0.0] * len(states)
636
+ for s_prime in states:
637
+ b_prime[s_prime] = StoppingGameUtil.pomdp_bayes_filter(
638
+ s_prime=s_prime, o=o, a=a, b=b, states=states, observations=observations,
639
+ transition_tensor=transition_tensor, observation_tensor=observation_tensor)
640
+ if round(sum(b_prime), 2) != 1:
641
+ print(f"error, b_prime:{b_prime}, o:{o}, a:{a}, b:{b}")
642
+ assert round(sum(b_prime), 2) == 1
643
+ return np.array(b_prime)
644
+
645
+ @staticmethod
646
+ def pomdp_bayes_filter(s_prime: int, o: int, a: int, b: npt.NDArray[np.float64], states: npt.NDArray[np.int32],
647
+ observations: npt.NDArray[np.int32], observation_tensor: npt.NDArray[np.float64],
648
+ transition_tensor: npt.NDArray[np.float64]) -> float:
649
+ """
650
+ A Bayesian filter to compute b[s_prime] of the POMDP
651
+
652
+ :param s_prime: the state to compute the belief for
653
+ :param o: the latest observation
654
+ :param a: the latest action
655
+ :param b: the current belief
656
+ :param states: the list of states
657
+ :param observations: the list of observations
658
+ :param observation_tensor: the observation tensor
659
+ :param transition_tensor: the transition tensor of the POMDP
660
+ :return: b[s_prime]
661
+ """
662
+ norm = 0.0
663
+ for s in states:
664
+ for s_prime_1 in states:
665
+ prob_1 = observation_tensor[a][s_prime_1][o]
666
+ norm += b[s] * prob_1 * transition_tensor[a][s][s_prime_1]
667
+ if norm == 0.0:
668
+ print(f"zero norm, a: {a}, b: {b}, o: {o}")
669
+ return 0.0
670
+ temp = 0.0
671
+
672
+ for s in states:
673
+ temp += observation_tensor[a][s_prime][o] * transition_tensor[a][s][s_prime] * b[s]
674
+ b_prime_s_prime = temp / norm
675
+ if round(b_prime_s_prime, 2) > 1:
676
+ print(f"b_prime_s_prime >= 1: {b_prime_s_prime}, a1:{a}, s_prime:{s_prime}")
677
+ assert round(b_prime_s_prime, 2) <= 1
678
+ if s_prime == 2 and o != observations[-1]:
679
+ assert round(b_prime_s_prime, 2) <= 0.01
680
+ return b_prime_s_prime
681
+
682
+ @staticmethod
683
+ def find_nearest_neighbor_belief(belief_space: npt.NDArray[np.float64], target_belief: npt.NDArray[np.float64]) \
684
+ -> npt.NDArray[np.float64]:
685
+ """
686
+ Finds the nearest neighbor (in the Euclidean sense) of a given belief in a certain belief space
687
+
688
+ :param belief_space: the belief space to search
689
+ :param target_belief: the belief to find the nearest neighbor of
690
+ :return: the nearest neighbor belief from the belief space
691
+ """
692
+
693
+ # Compute Euclidean distances between the target belief and all points in the belief space
694
+ distances = np.linalg.norm(belief_space - target_belief, axis=1)
695
+
696
+ # Find the index of the minimum distance (break ties consistently by choosing the smallest index)
697
+ nearest_index = int(np.argmin(distances))
698
+
699
+ return np.array(belief_space[nearest_index])
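# Illustrative usage sketch (not from the package source): projecting an exact belief update
# onto the aggregate belief space via the nearest-neighbor helper above.
B_n = StoppingGameUtil.generate_aggregate_belief_space(n=10, belief_space_dimension=3)
b = np.array([0.42, 0.58, 0.0])  # hypothetical belief point
b_hat = StoppingGameUtil.find_nearest_neighbor_belief(belief_space=B_n, target_belief=b)
# b_hat is [0.4, 0.6, 0.0], the closest grid point in the Euclidean sense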