noregret 0.0.0.dev1__tar.gz → 0.0.0.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: noregret
3
- Version: 0.0.0.dev1
3
+ Version: 0.0.0.dev3
4
4
  Summary: No-regret learning dynamics
5
5
  Home-page: https://github.com/uoftcprg/noregret
6
6
  Author: Universal, Open, Free, and Transparent Computer Poker Research Group
@@ -1,6 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
+ from collections import defaultdict
2
3
  from dataclasses import dataclass
3
- from functools import partial
4
+ from functools import cache, partial
4
5
  from itertools import permutations
5
6
  from math import factorial
6
7
  from typing import Any
@@ -73,8 +74,11 @@ class Game(ABC):
73
74
 
74
75
  for i, value in enumerate(self.values(*strategies)):
75
76
  opponent_strategies = strategies[:i] + strategies[i + 1:]
76
- _, br_value = self.best_response(i, *opponent_strategies)
77
- gap += br_value - value
77
+ _, best_response_value = self.best_response(
78
+ i,
79
+ *opponent_strategies,
80
+ )
81
+ gap += best_response_value - value
78
82
 
79
83
  return gap
80
84
 
@@ -86,8 +90,11 @@ class Game(ABC):
86
90
  average_opponent_strategies = (
87
91
  average_strategies[:i] + average_strategies[i + 1:]
88
92
  )
89
- _, br_value = self.best_response(i, *average_opponent_strategies)
90
- gap += br_value - value
93
+ _, best_response_value = self.best_response(
94
+ i,
95
+ *average_opponent_strategies,
96
+ )
97
+ gap += best_response_value - value
91
98
 
92
99
  return gap
93
100
 
@@ -613,3 +620,169 @@ class SymmetrizedGame(Game):
613
620
 
614
621
  def best_response(self, player, *opponent_strategies):
615
622
  raise NotImplementedError
623
+
624
+
625
class ExtensiveFormGame2(ABC):
    """Extensive-form game (EFG).

    Subclasses provide :attr:`players` and :attr:`initial_state`; the
    game tree is then explored through the :class:`State` interface.

    A strategy profile is a callable that takes a non-chance,
    non-terminal state and returns the action probabilities, aligned
    with ``state.actions``.
    """

    @dataclass(frozen=True)
    class State:
        """State of an extensive-form game."""

        @property
        @abstractmethod
        def utilities(self):
            """Return the utilities at this (terminal) state.

            Iterated in lockstep with the game's ``players``.
            """
            pass

        @property
        @abstractmethod
        def chance_action_probabilities(self):
            """Return ``(action, probability)`` pairs of a chance state."""
            pass

        @property
        @abstractmethod
        def actions(self):
            """Return the indexable actions available at this state."""
            pass

        @property
        @abstractmethod
        def infoset(self):
            """Return the hashable information set of this state."""
            pass

        @property
        @abstractmethod
        def player(self):
            """Return the player acting at this state."""
            pass

        @abstractmethod
        def is_terminal(self):
            """Return whether this state is terminal."""
            pass

        @abstractmethod
        def is_chance(self):
            """Return whether chance acts at this state."""
            pass

        @abstractmethod
        def utility(self, player):
            """Return the utility of ``player`` at this terminal state."""
            pass

        @abstractmethod
        def apply(self, action):
            """Return the state reached by taking ``action``."""
            pass

    @property
    @abstractmethod
    def players(self):
        """Return the players of the game."""
        pass

    @property
    @abstractmethod
    def initial_state(self):
        """Return the root state of the game tree."""
        pass

    def values(self, strategy_profile, state=None):
        """Return the expected utilities under a strategy profile.

        :param strategy_profile: Callable mapping a decision state to
                                 its action probabilities.
        :param state: State to evaluate from; the initial state when
                      ``None``.
        :return: The expected ``utilities`` from ``state`` onwards.
        """
        if state is None:
            return self.values(strategy_profile, self.initial_state)

        if state.is_terminal():
            return state.utilities

        if state.is_chance():
            actions, probabilities = zip(
                *state.chance_action_probabilities,
            )
        else:
            actions = state.actions
            probabilities = strategy_profile(state)

        # NOTE(review): this accumulation assumes ``utilities`` supports
        # scalar multiplication and addition (e.g. a NumPy array).
        values = 0

        for action, probability in zip(actions, probabilities):
            values += (
                probability
                * self.values(strategy_profile, state.apply(action))
            )

        return values

    def best_response_value(self, player, strategy_profile):
        """Return the value ``player`` obtains by best responding.

        Chance and every other player act according to
        ``strategy_profile``. At each of ``player``'s information sets
        a single action is chosen for all states in the set: the one
        maximizing the counterfactual-reach-weighted value.

        :param player: The best-responding player.
        :param strategy_profile: Callable mapping a decision state to
                                 its action probabilities.
        :return: The best-response value at the initial state.
        """
        infoset_states = defaultdict(list)
        counterfactual_reach_probabilities = {}

        def weighted_actions(state):
            # Action/probability pairs at a node not owned by ``player``.
            if state.is_chance():
                actions, probabilities = zip(
                    *state.chance_action_probabilities,
                )
            else:
                actions = state.actions
                probabilities = strategy_profile(state)

            return zip(actions, probabilities)

        def dfs(state, counterfactual_reach_probability):
            # Record, for every state, the probability that chance and
            # the other players lead to it, and group decision states
            # by information set.
            counterfactual_reach_probabilities[state] = (
                counterfactual_reach_probability
            )

            if state.is_terminal():
                return

            if not state.is_chance():
                infoset_states[state.infoset].append(state)

            if state.is_chance() or state.player != player:
                for action, probability in weighted_actions(state):
                    dfs(
                        state.apply(action),
                        probability * counterfactual_reach_probability,
                    )
            else:
                # The responder's own actions do not affect the
                # counterfactual reach probability.
                for action in state.actions:
                    dfs(state.apply(action), counterfactual_reach_probability)

        dfs(self.initial_state, 1)

        @cache
        def solve(state):
            # Value of ``state`` for ``player`` under the best response.
            if state.is_terminal():
                return state.utility(player)

            if state.is_chance() or state.player != player:
                value = 0

                for action, probability in weighted_actions(state):
                    value += probability * solve(state.apply(action))

                return value

            # Follow the action chosen for the whole information set and
            # return the value of *this* state. Returning the weighted
            # infoset total here would get weighted by the reach
            # probabilities a second time further up the tree.
            index = best_action_index(state.infoset)

            return solve(state.apply(state.actions[index]))

        @cache
        def best_action_index(infoset):
            # Index of the action maximizing the reach-weighted value
            # summed over all states of the information set.
            totals = defaultdict(int)

            for state in infoset_states[infoset]:
                weight = counterfactual_reach_probabilities[state]

                for i, action in enumerate(state.actions):
                    totals[i] += weight * solve(state.apply(action))

            return max(totals, key=totals.get)

        return solve(self.initial_state)

    def nash_gap(self, strategy_profile):
        """Return the sum of every player's best-response improvement.

        :param strategy_profile: Callable mapping a decision state to
                                 its action probabilities.
        :return: Sum over players of the best-response value minus the
                 value under ``strategy_profile``.
        """
        gap = 0

        for player, value in zip(self.players, self.values(strategy_profile)):
            best_response_value = self.best_response_value(
                player,
                strategy_profile,
            )
            gap += best_response_value - value

        return gap
@@ -10,6 +10,7 @@ import numpy as np
10
10
 
11
11
  from noregret.utilities import (
12
12
  euclidean_projection_on_probability_simplex,
13
+ sample,
13
14
  split,
14
15
  stationary_distribution,
15
16
  )
@@ -52,6 +53,9 @@ class RegretMinimizer(ABC):
52
53
  def next_strategy(self, prediction=False):
53
54
  pass
54
55
 
56
def undo_next_strategy(self):
    """Discard the most recently outputted, not yet observed strategy.

    :raises ValueError: If no strategy is pending, i.e. every outputted
                        strategy already has an observed utility.
    """
    # Same bookkeeping check as observe_utility: equal lengths mean there
    # is no outstanding strategy, and popping would drop one whose
    # utility has already been recorded.
    if len(self.strategies) == len(self.utilities):
        raise ValueError('next strategy not yet outputted')

    self.strategies.pop()
58
+
55
59
  def observe_utility(self, utility):
56
60
  if len(self.strategies) == len(self.utilities):
57
61
  raise ValueError('next strategy not yet outputted')
@@ -390,6 +394,9 @@ class BlumMansour(ProbabilitySimplexSwapRegretMinimizer):
390
394
 
391
395
  return strategy
392
396
 
397
def undo_next_strategy(self):
    """Undoing the latest strategy is not supported by this class.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
399
+
393
400
  def observe_utility(self, utility):
394
401
  super().observe_utility(utility)
395
402
 
@@ -471,6 +478,9 @@ class CounterfactualRegretMinimization(SequenceFormPolytopeRegretMinimizer):
471
478
 
472
479
  return strategy
473
480
 
481
def undo_next_strategy(self):
    """Undoing the latest strategy is not supported by this class.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
483
+
474
484
  def observe_utility(self, utility):
475
485
  super().observe_utility(utility)
476
486
 
@@ -568,6 +578,9 @@ class CartesianProductRegretCircuit(RegretCircuit):
568
578
 
569
579
  return strategy
570
580
 
581
def undo_next_strategy(self):
    """Undoing the latest strategy is not supported by this class.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
583
+
571
584
  def observe_utility(self, utility):
572
585
  super().observe_utility(utility)
573
586
 
@@ -634,6 +647,9 @@ class ConvexHullRegretCircuit(RegretCircuit):
634
647
 
635
648
  return strategy
636
649
 
650
def undo_next_strategy(self):
    """Undoing the latest strategy is not supported by this class.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
652
+
637
653
  def observe_utility(self, utility):
638
654
  super().observe_utility(utility)
639
655
 
@@ -643,3 +659,152 @@ class ConvexHullRegretCircuit(RegretCircuit):
643
659
  self.previous_outputs = self.outputs.copy()
644
660
 
645
661
  self.mixing_regret_minimizer.observe_utility(self.outputs @ utility)
662
+
663
+
664
@dataclass
class StochasticRegretMinimization(ABC):
    """Stochastic regret minimization.

    Base class for sampling-based regret minimization over an
    extensive-form game. Subclasses decide which local regret minimizer
    handles a given decision state via :meth:`_local_regret_minimizer`.
    """

    # Game being solved; its ``players`` and ``initial_state`` are read.
    extensive_form_game: Any

    @property
    def average_strategy_profile(self):
        """Return a callable mapping a state to its average strategy.

        The callable looks up the state's local regret minimizer and
        returns that minimizer's ``average_strategy``.
        """
        return lambda state: (
            self._local_regret_minimizer(state).average_strategy
        )

    @abstractmethod
    def _local_regret_minimizer(self, state):
        """Return the regret minimizer responsible for ``state``."""
        pass

    def external_sampling(self):
        """Run one external-sampling traversal for every player."""
        for player in self.extensive_form_game.players:
            self._external_sampling(
                player,
                self.extensive_form_game.initial_state,
            )

    def _external_sampling(self, player, state):
        """Traverse from ``state``, updating ``player``'s minimizers.

        Chance and the other players' actions are sampled; all of
        ``player``'s own actions are explored.

        :param player: The player whose regret minimizers are updated.
        :param state: The state to traverse from.
        :return: The sampled utility of ``player`` at ``state``.
        """
        if state.is_terminal():
            return state.utility(player)

        if state.is_chance():
            actions, probabilities = zip(*state.chance_action_probabilities)
            action = sample(actions, probabilities)

            return self._external_sampling(player, state.apply(action))

        local_regret_minimizer = self._local_regret_minimizer(state)
        actions = state.actions
        probabilities = local_regret_minimizer.next_strategy()

        if state.player == player:
            # Use an ndarray (as outcome sampling does) so that the dot
            # product below does not depend on ``probabilities`` being
            # an ndarray, and observe_utility always gets the same type.
            utilities = np.array(
                [
                    self._external_sampling(player, state.apply(action))
                    for action in actions
                ],
            )
            utility = utilities @ probabilities

            local_regret_minimizer.observe_utility(utilities)
        else:
            action = sample(actions, probabilities)
            utility = self._external_sampling(player, state.apply(action))

            # The strategy was only needed for sampling; no utility is
            # observed for it, so take it back.
            local_regret_minimizer.undo_next_strategy()

        return utility

    def outcome_sampling(self, reference_strategy_profile):
        """Run one outcome-sampling traversal for every player.

        :param reference_strategy_profile: Callable mapping a decision
                                           state to the probabilities
                                           used to sample the updating
                                           player's own actions.
        """
        for player in self.extensive_form_game.players:
            self._outcome_sampling(
                reference_strategy_profile,
                player,
                self.extensive_form_game.initial_state,
                1,
            )

    def _outcome_sampling(
            self,
            reference_strategy_profile,
            player,
            state,
            reference_reach_probability,
    ):
        """Sample a single trajectory from ``state`` and update regrets.

        :param reference_strategy_profile: Callable giving the sampling
                                           probabilities at ``player``'s
                                           own decision states.
        :param player: The player whose regret minimizers are updated.
        :param state: The state to traverse from.
        :param reference_reach_probability: Product of the reference
                                            probabilities of the
                                            actions ``player`` has
                                            sampled so far.
        :return: The sampled utility estimate at ``state``.
        """
        if state.is_terminal():
            # Importance-weight the payoff by the player's own sampling
            # probability along the trajectory.
            return state.utility(player) / reference_reach_probability

        if state.is_chance():
            actions, probabilities = zip(*state.chance_action_probabilities)
            action = sample(actions, probabilities)

            return self._outcome_sampling(
                reference_strategy_profile,
                player,
                state.apply(action),
                reference_reach_probability,
            )

        local_regret_minimizer = self._local_regret_minimizer(state)
        actions = state.actions

        if state.player == player:
            probabilities = reference_strategy_profile(state)
            index = sample(range(len(actions)), probabilities)
            action = actions[index]
            probability = probabilities[index]
            # NOTE(review): the child estimate is scaled by the sampled
            # action's *reference* probability, and that same product is
            # recorded as the action's utility -- confirm this is the
            # intended importance weighting.
            utility = (
                probability
                * self._outcome_sampling(
                    reference_strategy_profile,
                    player,
                    state.apply(action),
                    probability * reference_reach_probability,
                )
            )
            # Only the sampled action receives a non-zero utility.
            utilities = np.zeros(len(actions))
            utilities[index] = utility

            local_regret_minimizer.next_strategy()
            local_regret_minimizer.observe_utility(utilities)
        else:
            probabilities = local_regret_minimizer.next_strategy()
            action = sample(actions, probabilities)
            utility = self._outcome_sampling(
                reference_strategy_profile,
                player,
                state.apply(action),
                reference_reach_probability,
            )

            # The strategy was only needed for sampling; take it back.
            local_regret_minimizer.undo_next_strategy()

        return utility
780
+
781
+
782
@dataclass
class MonteCarloCounterfactualRegretMinimization(StochasticRegretMinimization):
    """Monte Carlo Counterfactual regret minimization (MCCFR).

    Keeps one regret minimizer per information set, created on first
    use by calling ``regret_minimizer_factory`` with the number of
    actions available at that information set.
    """

    regret_minimizer_factory: Any = partial(
        RegretMatching,
        is_time_symmetric=True,
    )
    _: KW_ONLY
    # Maps an information set to its regret minimizer; not an init arg.
    local_regret_minimizers: Any = field(init=False, default_factory=dict)

    @property
    def iteration_count(self):
        """Return the total iteration count over all local minimizers."""
        return sum(
            minimizer.iteration_count
            for minimizer in self.local_regret_minimizers.values()
        )

    def _local_regret_minimizer(self, state):
        """Return the minimizer of ``state``'s infoset, creating it if new."""
        infoset = state.infoset
        minimizers = self.local_regret_minimizers

        if infoset not in minimizers:
            minimizers[infoset] = self.regret_minimizer_factory(
                len(state.actions),
            )

        return minimizers[infoset]
@@ -6,6 +6,7 @@ from functools import partial
6
6
  from importlib import import_module
7
7
  from json import dump, dumps, load, loads
8
8
  from math import inf
9
+ from random import choices
9
10
  from typing import Any
10
11
 
11
12
  from ordered_set import OrderedSet
@@ -47,7 +48,7 @@ def stationary_distribution(stochastic_matrix):
47
48
  P = stochastic_matrix
48
49
 
49
50
  if not np.allclose(P.sum(1), 1):
50
- raise ValueError('matrix not stochastic')
51
+ raise ValueError('matrix not left stochastic')
51
52
 
52
53
  eigenvalues, eigenvectors = LA.eig(P.T)
53
54
  pi = eigenvectors[:, np.isclose(eigenvalues, 1)][:, 0]
@@ -96,6 +97,10 @@ def split(values, counts):
96
97
  return splits
97
98
 
98
99
 
100
def sample(values, probabilities):
    """Draw one of ``values`` at random, weighted by ``probabilities``.

    :param values: The candidates to choose from.
    :param probabilities: The weight of each candidate, in order.
    :return: The drawn value.
    """
    (drawn,) = choices(values, weights=probabilities, k=1)

    return drawn
102
+
103
+
99
104
  class Serializable(ABC):
100
105
  @classmethod
101
106
  @abstractmethod
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: noregret
3
- Version: 0.0.0.dev1
3
+ Version: 0.0.0.dev3
4
4
  Summary: No-regret learning dynamics
5
5
  Home-page: https://github.com/uoftcprg/noregret
6
6
  Author: Universal, Open, Free, and Transparent Computer Poker Research Group
@@ -4,7 +4,7 @@ from setuptools import find_packages, setup
4
4
 
5
5
  setup(
6
6
  name='noregret',
7
- version='0.0.0.dev1',
7
+ version='0.0.0.dev3',
8
8
  description='No-regret learning dynamics',
9
9
  long_description=open('README.md').read(),
10
10
  long_description_content_type='text/markdown',
File without changes
File without changes
File without changes