noregret 0.0.0.dev0__tar.gz → 0.0.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: noregret
3
- Version: 0.0.0.dev0
3
+ Version: 0.0.0.dev2
4
4
  Summary: No-regret learning dynamics
5
5
  Home-page: https://github.com/uoftcprg/noregret
6
6
  Author: Universal, Open, Free, and Transparent Computer Poker Research Group
@@ -1,10 +1,12 @@
1
1
  from abc import ABC, abstractmethod
2
- from dataclasses import dataclass, field
3
- from functools import partial
4
- from itertools import count, permutations
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from functools import cache, partial
5
+ from itertools import permutations
5
6
  from math import factorial
6
7
  from typing import Any
7
8
 
9
+ from ordered_set import OrderedSet
8
10
  from scipy.sparse import lil_array
9
11
  import numpy as np
10
12
 
@@ -72,8 +74,11 @@ class Game(ABC):
72
74
 
73
75
  for i, value in enumerate(self.values(*strategies)):
74
76
  opponent_strategies = strategies[:i] + strategies[i + 1:]
75
- _, br_value = self.best_response(i, *opponent_strategies)
76
- gap += br_value - value
77
+ _, best_response_value = self.best_response(
78
+ i,
79
+ *opponent_strategies,
80
+ )
81
+ gap += best_response_value - value
77
82
 
78
83
  return gap
79
84
 
@@ -85,8 +90,11 @@ class Game(ABC):
85
90
  average_opponent_strategies = (
86
91
  average_strategies[:i] + average_strategies[i + 1:]
87
92
  )
88
- _, br_value = self.best_response(i, *average_opponent_strategies)
89
- gap += br_value - value
93
+ _, best_response_value = self.best_response(
94
+ i,
95
+ *average_opponent_strategies,
96
+ )
97
+ gap += best_response_value - value
90
98
 
91
99
  return gap
92
100
 
@@ -255,13 +263,11 @@ class NormalFormGame(Serializable, Game):
255
263
 
256
264
  actions: Any
257
265
  utilities: Any
258
- indices: Any = field(init=False, default_factory=list)
259
266
 
260
267
  def __post_init__(self):
261
268
  super().__post_init__()
262
269
 
263
- for i, actions in enumerate(self.actions):
264
- self.indices.append(dict(zip(actions, count())))
270
+ self.actions = tuple(map(OrderedSet, self.actions))
265
271
 
266
272
  def _verify(self, *, utilities_shape=None, **kwargs):
267
273
  super()._verify(**kwargs)
@@ -304,14 +310,6 @@ class TwoPlayerNormalFormGame(TwoPlayerGame, NormalFormGame):
304
310
  def column_actions(self):
305
311
  return self.actions[1]
306
312
 
307
- @property
308
- def row_indices(self):
309
- return self.indices[0]
310
-
311
- @property
312
- def column_indices(self):
313
- return self.indices[1]
314
-
315
313
  @property
316
314
  def row_utilities(self):
317
315
  return self.utilities[:, :, 0]
@@ -414,7 +412,7 @@ class TwoPlayerExtensiveFormGame(TwoPlayerGame, ExtensiveFormGame):
414
412
  for tfsdp, sequence in zip(tfsdps, raw_utility['sequences']):
415
413
  sequence = tuple(sequence)
416
414
 
417
- indices.append(tfsdp.indices[sequence])
415
+ indices.append(tfsdp.sequences.index(sequence))
418
416
 
419
417
  indices = tuple(indices)
420
418
  row_utilities[indices] = raw_utility['values'][0]
@@ -448,14 +446,6 @@ class TwoPlayerExtensiveFormGame(TwoPlayerGame, ExtensiveFormGame):
448
446
  def column_sequences(self):
449
447
  return self.column_tree_form_sequential_decision_process.sequences
450
448
 
451
- @property
452
- def row_indices(self):
453
- return self.row_tree_form_sequential_decision_process.indices
454
-
455
- @property
456
- def column_indices(self):
457
- return self.column_tree_form_sequential_decision_process.indices
458
-
459
449
  @property
460
450
  def row_utilities(self):
461
451
  return self.utilities[0]
@@ -530,7 +520,7 @@ class TwoPlayerZeroSumExtensiveFormGame(
530
520
  for tfsdp, sequence in zip(tfsdps, raw_utility['sequences']):
531
521
  sequence = tuple(sequence)
532
522
 
533
- indices.append(tfsdp.indices[sequence])
523
+ indices.append(tfsdp.sequences.index(sequence))
534
524
 
535
525
  indices = tuple(indices)
536
526
  utilities[indices] = raw_utility['value']
@@ -630,3 +620,169 @@ class SymmetrizedGame(Game):
630
620
 
631
621
  def best_response(self, player, *opponent_strategies):
632
622
  raise NotImplementedError
623
+
624
+
625
class ExtensiveFormGame2(ABC):
    """Extensive-form game (EFG).

    Concrete games provide ``players`` and ``initial_state``; everything
    else walks the game tree through the :class:`State` interface.

    A *strategy profile* is a callable that is given a state at which a
    player (not chance) acts and returns that state's action
    probabilities, in the same order as ``state.actions``.
    """

    @dataclass(frozen=True)
    class State:
        """State of an extensive-form game.

        ``best_response_value`` uses states as dictionary and cache keys,
        so implementations must be hashable.

        NOTE: this class does not derive from ``ABC``, so the
        ``abstractmethod`` markers below document the required interface
        but are not enforced when a subclass is instantiated.
        """

        @property
        @abstractmethod
        def utilities(self):
            """Utilities of a terminal state, one entry per player.

            ``ExtensiveFormGame2.values`` multiplies this by scalar
            probabilities and sums the results, so it must support that
            arithmetic (presumably a NumPy array -- confirm against the
            concrete implementations).
            """

        @property
        @abstractmethod
        def chance_action_probabilities(self):
            """Iterable of ``(action, probability)`` pairs.

            Only read on states for which ``is_chance()`` is true.
            """

        @property
        @abstractmethod
        def actions(self):
            """Actions available at this (non-chance) state."""

        @property
        @abstractmethod
        def infoset(self):
            """Hashable key of the information set containing this state."""

        @property
        @abstractmethod
        def player(self):
            """Player acting at this (non-chance) state."""

        @abstractmethod
        def is_terminal(self):
            """Return whether the game is over at this state."""

        @abstractmethod
        def is_chance(self):
            """Return whether chance acts at this state."""

        @abstractmethod
        def utility(self, player):
            """Return the utility of ``player`` at this terminal state."""

        @abstractmethod
        def apply(self, action):
            """Return the state reached by taking ``action`` here."""

    @property
    @abstractmethod
    def players(self):
        """Players of the game, ordered like the entries of ``values``."""

    @property
    @abstractmethod
    def initial_state(self):
        """Root state of the game tree."""

    @staticmethod
    def _action_probabilities(state, strategy_profile):
        """Return the ``(action, probability)`` pairs at a decision state.

        Chance states supply their own distribution; at every other state
        the probabilities come from ``strategy_profile``.
        """
        if state.is_chance():
            return state.chance_action_probabilities

        return zip(state.actions, strategy_profile(state))

    def values(self, strategy_profile, state=None):
        """Return the expected utilities under ``strategy_profile``.

        :param strategy_profile: Callable mapping a state to its action
                                 probabilities.
        :param state: State to evaluate from; the initial state when
                      omitted.
        :return: ``state.utilities`` at a terminal state, otherwise the
                 probability-weighted sum of the children's values.
        """
        if state is None:
            state = self.initial_state

        if state.is_terminal():
            return state.utilities

        values = 0
        pairs = self._action_probabilities(state, strategy_profile)

        for action, probability in pairs:
            values += (
                probability
                * self.values(strategy_profile, state.apply(action))
            )

        return values

    def best_response_value(self, player, strategy_profile):
        """Return ``player``'s value when best responding to the others.

        Every other player (and chance) follows ``strategy_profile``.  At
        each of ``player``'s information sets, one action is chosen for
        all states in the set: the action maximising the sum of the
        children's values weighted by each state's counterfactual reach
        probability (the product of chance and opponent probabilities on
        the path to it).

        :param player: The best-responding player.
        :param strategy_profile: Callable mapping a state to its action
                                 probabilities.
        :return: The value of the initial state for ``player``.
        """
        infoset_states = defaultdict(list)
        counterfactual_reach_probabilities = {}

        def others_act(state):
            return state.is_chance() or state.player != player

        def explore(state, counterfactual_reach_probability):
            counterfactual_reach_probabilities[state] = (
                counterfactual_reach_probability
            )

            if state.is_terminal():
                return

            if others_act(state):
                pairs = self._action_probabilities(state, strategy_profile)

                for action, probability in pairs:
                    explore(
                        state.apply(action),
                        probability * counterfactual_reach_probability,
                    )
            else:
                infoset_states[state.infoset].append(state)

                # The responder's own choices do not discount the reach.
                for action in state.actions:
                    explore(
                        state.apply(action),
                        counterfactual_reach_probability,
                    )

        explore(self.initial_state, 1)

        @cache
        def best_action_index(infoset):
            totals = defaultdict(int)

            for state in infoset_states[infoset]:
                weight = counterfactual_reach_probabilities[state]

                for i, action in enumerate(state.actions):
                    totals[i] += weight * solve(state.apply(action))

            # Ties resolve to the lowest action index.
            return max(totals, key=totals.get)

        @cache
        def solve(state):
            if state.is_terminal():
                return state.utility(player)

            if others_act(state):
                value = 0
                pairs = self._action_probabilities(state, strategy_profile)

                for action, probability in pairs:
                    value += probability * solve(state.apply(action))

                return value

            # Return the value of *this* state under the infoset's chosen
            # action.  The reach-weighted totals are only for picking the
            # action; returning them here would have the ancestors apply
            # the reach probabilities a second time.
            index = best_action_index(state.infoset)
            action = tuple(state.actions)[index]

            return solve(state.apply(action))

        return solve(self.initial_state)

    def nash_gap(self, strategy_profile):
        """Return the summed best-response improvement over all players.

        For each player this adds the difference between the player's
        best-response value against ``strategy_profile`` and the player's
        value under ``strategy_profile`` itself.
        """
        gap = 0

        for player, value in zip(self.players, self.values(strategy_profile)):
            best_response_value = self.best_response_value(
                player,
                strategy_profile,
            )
            gap += best_response_value - value

        return gap
@@ -10,6 +10,7 @@ import numpy as np
10
10
 
11
11
  from noregret.utilities import (
12
12
  euclidean_projection_on_probability_simplex,
13
+ sample,
13
14
  split,
14
15
  stationary_distribution,
15
16
  )
@@ -52,6 +53,9 @@ class RegretMinimizer(ABC):
52
53
  def next_strategy(self, prediction=False):
53
54
  pass
54
55
 
56
def undo_next_strategy(self):
    """Discard the strategy output by the latest ``next_strategy`` call.

    Only a strategy that has not yet been paired with a utility may be
    discarded; removing any other one would leave ``strategies`` and
    ``utilities`` misaligned.

    :raises ValueError: If there is no outputted strategy still awaiting
                        a utility (same condition and message as
                        ``observe_utility``).
    """
    if len(self.strategies) == len(self.utilities):
        raise ValueError('next strategy not yet outputted')

    self.strategies.pop()
58
+
55
59
  def observe_utility(self, utility):
56
60
  if len(self.strategies) == len(self.utilities):
57
61
  raise ValueError('next strategy not yet outputted')
@@ -384,12 +388,15 @@ class BlumMansour(ProbabilitySimplexSwapRegretMinimizer):
384
388
  self.previous_strategy[a] * prediction,
385
389
  )
386
390
 
387
- strategy = stationary_distribution(self.outputs.T)
391
+ strategy = stationary_distribution(self.outputs)
388
392
 
389
393
  self.strategies.append(strategy)
390
394
 
391
395
  return strategy
392
396
 
397
def undo_next_strategy(self):
    """Undoing an outputted strategy is not supported here.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
399
+
393
400
  def observe_utility(self, utility):
394
401
  super().observe_utility(utility)
395
402
 
@@ -471,6 +478,9 @@ class CounterfactualRegretMinimization(SequenceFormPolytopeRegretMinimizer):
471
478
 
472
479
  return strategy
473
480
 
481
def undo_next_strategy(self):
    """Undoing an outputted strategy is not supported here.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
483
+
474
484
  def observe_utility(self, utility):
475
485
  super().observe_utility(utility)
476
486
 
@@ -568,6 +578,9 @@ class CartesianProductRegretCircuit(RegretCircuit):
568
578
 
569
579
  return strategy
570
580
 
581
def undo_next_strategy(self):
    """Undoing an outputted strategy is not supported here.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
583
+
571
584
  def observe_utility(self, utility):
572
585
  super().observe_utility(utility)
573
586
 
@@ -634,6 +647,9 @@ class ConvexHullRegretCircuit(RegretCircuit):
634
647
 
635
648
  return strategy
636
649
 
650
def undo_next_strategy(self):
    """Undoing an outputted strategy is not supported here.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
652
+
637
653
  def observe_utility(self, utility):
638
654
  super().observe_utility(utility)
639
655
 
@@ -643,3 +659,152 @@ class ConvexHullRegretCircuit(RegretCircuit):
643
659
  self.previous_outputs = self.outputs.copy()
644
660
 
645
661
  self.mixing_regret_minimizer.observe_utility(self.outputs @ utility)
662
+
663
+
664
@dataclass
class StochasticRegretMinimization(ABC):
    """Stochastic regret minimization.

    Runs sampled traversals of ``extensive_form_game`` and feeds the
    sampled utilities to per-state regret minimizers obtained from
    ``_local_regret_minimizer``.
    """

    # Game object exposing ``players`` and ``initial_state``.
    extensive_form_game: Any

    @property
    def average_strategy_profile(self):
        """Strategy profile built from the local average strategies.

        :return: A callable mapping a state to the ``average_strategy`` of
                 that state's local regret minimizer.
        """
        def strategy_profile(state):
            return self._local_regret_minimizer(state).average_strategy

        return strategy_profile

    @abstractmethod
    def _local_regret_minimizer(self, state):
        """Return the regret minimizer responsible for ``state``."""

    def external_sampling(self):
        """Run one external-sampling traversal per player."""
        root = self.extensive_form_game.initial_state

        for player in self.extensive_form_game.players:
            self._external_sampling(player, root)

    def _external_sampling(self, player, state):
        """Traverse from ``state``, updating ``player``'s minimizers.

        Chance and the other players have a single action sampled; every
        action of ``player`` is explored.

        :param player: The player whose regret minimizers are updated.
        :param state: The state to traverse from.
        :return: The sampled utility of ``state`` for ``player``.
        """
        if state.is_terminal():
            return state.utility(player)

        if state.is_chance():
            actions, probabilities = zip(*state.chance_action_probabilities)
            action = sample(actions, probabilities)

            return self._external_sampling(player, state.apply(action))

        local_regret_minimizer = self._local_regret_minimizer(state)
        actions = state.actions
        probabilities = local_regret_minimizer.next_strategy()

        if state.player == player:
            # Build a float array (not a list) so that the dot product
            # works whatever sequence type the minimizer returned, and so
            # that ``observe_utility`` receives the same kind of object as
            # it does in outcome sampling.
            utilities = np.array(
                [
                    self._external_sampling(player, state.apply(action))
                    for action in actions
                ],
                dtype=float,
            )
            utility = utilities @ probabilities

            local_regret_minimizer.observe_utility(utilities)
        else:
            action = sample(actions, probabilities)
            utility = self._external_sampling(player, state.apply(action))

            # The strategy was only needed for sampling; no utility will
            # be observed for it.
            local_regret_minimizer.undo_next_strategy()

        return utility

    def outcome_sampling(self, reference_strategy_profile):
        """Run one outcome-sampling traversal per player.

        :param reference_strategy_profile: Callable mapping a state to the
                                           probabilities used to sample
                                           the traversing player's action.
        """
        root = self.extensive_form_game.initial_state

        for player in self.extensive_form_game.players:
            self._outcome_sampling(reference_strategy_profile, player, root, 1)

    def _outcome_sampling(
            self,
            reference_strategy_profile,
            player,
            state,
            reference_reach_probability,
    ):
        """Sample a single trajectory from ``state`` and update ``player``.

        :param reference_strategy_profile: Callable giving the sampling
                                           probabilities at ``player``'s
                                           states.
        :param player: The player whose regret minimizers are updated.
        :param state: The state to traverse from.
        :param reference_reach_probability: Product of the reference
                                            probabilities of ``player``'s
                                            sampled actions so far.
        :return: The sampled utility estimate of ``state`` for ``player``.
        """
        if state.is_terminal():
            return state.utility(player) / reference_reach_probability

        if state.is_chance():
            actions, probabilities = zip(*state.chance_action_probabilities)
            action = sample(actions, probabilities)

            return self._outcome_sampling(
                reference_strategy_profile,
                player,
                state.apply(action),
                reference_reach_probability,
            )

        local_regret_minimizer = self._local_regret_minimizer(state)
        actions = state.actions

        if state.player == player:
            probabilities = reference_strategy_profile(state)
            index = sample(range(len(actions)), probabilities)
            probability = probabilities[index]
            utility = probability * self._outcome_sampling(
                reference_strategy_profile,
                player,
                state.apply(actions[index]),
                probability * reference_reach_probability,
            )

            # Only the sampled action receives a non-zero estimate.
            utilities = np.zeros(len(actions))
            utilities[index] = utility

            local_regret_minimizer.next_strategy()
            local_regret_minimizer.observe_utility(utilities)
        else:
            probabilities = local_regret_minimizer.next_strategy()
            action = sample(actions, probabilities)
            utility = self._outcome_sampling(
                reference_strategy_profile,
                player,
                state.apply(action),
                reference_reach_probability,
            )

            # The strategy was only needed for sampling; no utility will
            # be observed for it.
            local_regret_minimizer.undo_next_strategy()

        return utility
780
+
781
+
782
@dataclass
class MonteCarloCounterfactualRegretMinimization(StochasticRegretMinimization):
    """Monte Carlo Counterfactual regret minimization (MCCFR).

    Keeps one regret minimizer per information set, created on first use
    by calling ``regret_minimizer_factory`` with the number of actions.
    """

    # Called with the action count to build a new local regret minimizer.
    regret_minimizer_factory: Any = partial(
        RegretMatching,
        is_time_symmetric=True,
    )
    _: KW_ONLY
    # Maps ``state.infoset`` to the regret minimizer created for it.
    local_regret_minimizers: Any = field(init=False, default_factory=dict)

    @property
    def iteration_count(self):
        """Total of ``iteration_count`` over all local regret minimizers."""
        return sum(
            minimizer.iteration_count
            for minimizer in self.local_regret_minimizers.values()
        )

    def _local_regret_minimizer(self, state):
        """Return the regret minimizer of ``state.infoset``.

        A minimizer is created and stored the first time an information
        set is seen.
        """
        infoset = state.infoset
        minimizers = self.local_regret_minimizers

        if infoset not in minimizers:
            minimizers[infoset] = self.regret_minimizer_factory(
                len(state.actions),
            )

        return minimizers[infoset]
@@ -6,6 +6,7 @@ from functools import partial
6
6
  from importlib import import_module
7
7
  from json import dump, dumps, load, loads
8
8
  from math import inf
9
+ from random import choices
9
10
  from typing import Any
10
11
 
11
12
  from ordered_set import OrderedSet
@@ -45,11 +46,7 @@ def euclidean_projection_on_probability_simplex(input_):
45
46
 
46
47
  def stationary_distribution(stochastic_matrix):
47
48
  P = stochastic_matrix
48
-
49
- if not np.allclose(P.sum(1), 1):
50
- raise ValueError('matrix not stochastic')
51
-
52
- eigenvalues, eigenvectors = LA.eig(P.T)
49
+ eigenvalues, eigenvectors = LA.eig(P)
53
50
  pi = eigenvectors[:, np.isclose(eigenvalues, 1)][:, 0]
54
51
  pi /= pi.sum()
55
52
  pi = pi.real
@@ -96,6 +93,10 @@ def split(values, counts):
96
93
  return splits
97
94
 
98
95
 
96
def sample(values, probabilities):
    """Draw one element of ``values`` at random.

    :param values: Sequence to draw from.
    :param probabilities: Relative weights, aligned with ``values``.
    :return: The drawn element.
    """
    (drawn,) = choices(values, weights=probabilities, k=1)

    return drawn
98
+
99
+
99
100
  class Serializable(ABC):
100
101
  @classmethod
101
102
  @abstractmethod
@@ -153,7 +154,6 @@ class TreeFormSequentialDecisionProcess(Serializable):
153
154
  decision_points: Any = field(init=False, default_factory=OrderedSet)
154
155
  observation_points: Any = field(init=False, default_factory=OrderedSet)
155
156
  sequences: Any = field(init=False, default_factory=OrderedSet)
156
- indices: Any = field(init=False, default_factory=dict)
157
157
  parent_sequences: Any = field(init=False, default_factory=dict)
158
158
  actions: Any = field(
159
159
  init=False,
@@ -203,8 +203,6 @@ class TreeFormSequentialDecisionProcess(Serializable):
203
203
  if is_sequence:
204
204
  self.sequences.add(parent_edge)
205
205
 
206
- self.indices[parent_edge] = len(self.indices)
207
-
208
206
  self.parent_sequences[p] = parent_sequence
209
207
 
210
208
  def behavioral_uniform_strategy(self):
@@ -227,7 +225,7 @@ class TreeFormSequentialDecisionProcess(Serializable):
227
225
 
228
226
  for i, a in enumerate(self.actions[p]):
229
227
  value = (
230
- utility[self.indices[p, a]]
228
+ utility[self.sequences.index((p, a))]
231
229
  + V[self.transitions[p, a]]
232
230
  )
233
231
 
@@ -250,15 +248,15 @@ class TreeFormSequentialDecisionProcess(Serializable):
250
248
 
251
249
  def behavioral_to_sequence_form(self, behavioral_strategy):
252
250
  strategy = np.zeros(len(self.sequences))
253
- strategy[self.indices[()]] = 1
251
+ strategy[self.sequences.index(())] = 1
254
252
 
255
253
  for j in self.decision_points:
256
254
  p_j = self.parent_sequences[j]
257
255
 
258
256
  for i, a in enumerate(self.actions[j]):
259
- strategy[self.indices[j, a]] = (
257
+ strategy[self.sequences.index((j, a))] = (
260
258
  behavioral_strategy[j][i]
261
- * strategy[self.indices[p_j]]
259
+ * strategy[self.sequences.index(p_j)]
262
260
  )
263
261
 
264
262
  return strategy
@@ -273,7 +271,7 @@ class TreeFormSequentialDecisionProcess(Serializable):
273
271
  V[p] += (
274
272
  behavioral_strategy[p][i]
275
273
  * (
276
- utility[self.indices[p, a]]
274
+ utility[self.sequences.index((p, a))]
277
275
  + V[self.transitions[p, a]]
278
276
  )
279
277
  )
@@ -288,7 +286,7 @@ class TreeFormSequentialDecisionProcess(Serializable):
288
286
 
289
287
  for i, a in enumerate(self.actions[j]):
290
288
  utilities[j][i] = (
291
- utility[self.indices[j, a]]
289
+ utility[self.sequences.index((j, a))]
292
290
  + V[self.transitions[j, a]]
293
291
  )
294
292
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: noregret
3
- Version: 0.0.0.dev0
3
+ Version: 0.0.0.dev2
4
4
  Summary: No-regret learning dynamics
5
5
  Home-page: https://github.com/uoftcprg/noregret
6
6
  Author: Universal, Open, Free, and Transparent Computer Poker Research Group
@@ -4,7 +4,7 @@ from setuptools import find_packages, setup
4
4
 
5
5
  setup(
6
6
  name='noregret',
7
- version='0.0.0.dev0',
7
+ version='0.0.0.dev2',
8
8
  description='No-regret learning dynamics',
9
9
  long_description=open('README.md').read(),
10
10
  long_description_content_type='text/markdown',
File without changes
File without changes
File without changes