pyRDDLGym-jax 2.3-py3-none-any.whl → 2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyRDDLGym_jax/__init__.py +1 -1
- pyRDDLGym_jax/core/compiler.py +2 -3
- pyRDDLGym_jax/core/logic.py +117 -66
- pyRDDLGym_jax/core/planner.py +489 -218
- pyRDDLGym_jax/core/tuning.py +28 -22
- pyRDDLGym_jax/examples/run_plan.py +2 -2
- pyRDDLGym_jax/examples/run_scipy.py +2 -2
- {pyrddlgym_jax-2.3.dist-info → pyrddlgym_jax-2.4.dist-info}/METADATA +1 -1
- {pyrddlgym_jax-2.3.dist-info → pyrddlgym_jax-2.4.dist-info}/RECORD +13 -13
- {pyrddlgym_jax-2.3.dist-info → pyrddlgym_jax-2.4.dist-info}/LICENSE +0 -0
- {pyrddlgym_jax-2.3.dist-info → pyrddlgym_jax-2.4.dist-info}/WHEEL +0 -0
- {pyrddlgym_jax-2.3.dist-info → pyrddlgym_jax-2.4.dist-info}/entry_points.txt +0 -0
- {pyrddlgym_jax-2.3.dist-info → pyrddlgym_jax-2.4.dist-info}/top_level.txt +0 -0
pyRDDLGym_jax/core/planner.py
CHANGED
@@ -3,7 +3,7 @@
 #
 # Author: Michael Gimelfarb
 #
-#
+# REFERENCES:
 #
 # [1] Gimelfarb, Michael, Ayal Taitler, and Scott Sanner. "JaxPlan and GurobiPlan:
 # Optimization Baselines for Replanning in Discrete and Mixed Discrete-Continuous
@@ -18,16 +18,21 @@
 # reactive policies for planning in stochastic nonlinear domains." In Proceedings of the
 # AAAI Conference on Artificial Intelligence, vol. 33, no. 01, pp. 7530-7537. 2019.
 #
-# [4]
+# [4] Cui, Hao, Thomas Keller, and Roni Khardon. "Stochastic planning with lifted symbolic
+# trajectory optimization." In Proceedings of the International Conference on Automated
+# Planning and Scheduling, vol. 29, pp. 119-127. 2019.
+#
+# [5] Wu, Ga, Buser Say, and Scott Sanner. "Scalable planning with tensorflow for hybrid
 # nonlinear domains." Advances in Neural Information Processing Systems 30 (2017).
 #
-# [
+# [6] Sehnke, Frank, and Tingting Zhao. "Baseline-free sampling in parameter exploring
 # policy gradients: Super symmetric pgpe." Artificial Neural Networks: Methods and
 # Applications in Bio-/Neuroinformatics. Springer International Publishing, 2015.
 #
 # ***********************************************************************
 
 
+from abc import ABCMeta, abstractmethod
 from ast import literal_eval
 from collections import deque
 import configparser
@@ -37,7 +42,8 @@ import os
 import sys
 import time
 import traceback
-from typing import Any, Callable, Dict, Generator, Optional, Set, Sequence, Type, Tuple,
+from typing import Any, Callable, Dict, Generator, Optional, Set, Sequence, Type, Tuple, \
+    Union
 
 import haiku as hk
 import jax
@@ -51,6 +57,7 @@ from tqdm import tqdm, TqdmWarning
 import warnings
 warnings.filterwarnings("ignore", category=TqdmWarning)
 
+from pyRDDLGym.core.compiler.initializer import RDDLValueInitializer
 from pyRDDLGym.core.compiler.model import RDDLPlanningModel, RDDLLiftedModel
 from pyRDDLGym.core.debug.logger import Logger
 from pyRDDLGym.core.debug.exception import (
@@ -157,25 +164,20 @@ def _load_config(config, args):
         initializer = _getattr_any(
             packages=[initializers, hk.initializers], item=plan_initializer)
         if initializer is None:
-
-            del plan_kwargs['initializer']
+            raise ValueError(f'Invalid initializer <{plan_initializer}>.')
         else:
             init_kwargs = plan_kwargs.pop('initializer_kwargs', {})
             try:
                 plan_kwargs['initializer'] = initializer(**init_kwargs)
             except Exception as _:
-
-                f'Ignoring invalid initializer_kwargs <{init_kwargs}>.', 'red')
-                plan_kwargs['initializer'] = initializer
+                raise ValueError(f'Invalid initializer kwargs <{init_kwargs}>.')
 
     # policy activation
     plan_activation = plan_kwargs.get('activation', None)
     if plan_activation is not None:
-        activation = _getattr_any(
-            packages=[jax.nn, jax.numpy], item=plan_activation)
+        activation = _getattr_any(packages=[jax.nn, jax.numpy], item=plan_activation)
         if activation is None:
-
-            del plan_kwargs['activation']
+            raise ValueError(f'Invalid activation <{plan_activation}>.')
         else:
             plan_kwargs['activation'] = activation
 
@@ -188,8 +190,7 @@ def _load_config(config, args):
     if planner_optimizer is not None:
         optimizer = _getattr_any(packages=[optax], item=planner_optimizer)
         if optimizer is None:
-
-            del planner_args['optimizer']
+            raise ValueError(f'Invalid optimizer <{planner_optimizer}>.')
         else:
             planner_args['optimizer'] = optimizer
 
@@ -200,8 +201,7 @@ def _load_config(config, args):
         if 'optimizer' in pgpe_kwargs:
             pgpe_optimizer = _getattr_any(packages=[optax], item=pgpe_kwargs['optimizer'])
             if pgpe_optimizer is None:
-
-                del pgpe_kwargs['optimizer']
+                raise ValueError(f'Invalid optimizer <{pgpe_optimizer}>.')
             else:
                 pgpe_kwargs['optimizer'] = pgpe_optimizer
         planner_args['pgpe'] = getattr(sys.modules[__name__], pgpe_method)(**pgpe_kwargs)
@@ -260,8 +260,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
                  cpfs_without_grad: Optional[Set[str]]=None,
                  **kwargs) -> None:
         '''Creates a new RDDL to Jax compiler, where operations that are not
-        differentiable are converted to approximate forms that have defined
-        gradients.
+        differentiable are converted to approximate forms that have defined gradients.
 
         :param *args: arguments to pass to base compiler
         :param logic: Fuzzy logic object that specifies how exact operations
@@ -286,8 +285,10 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
             if not np.issubdtype(np.result_type(values), np.floating):
                 pvars_cast.add(var)
         if pvars_cast:
-
-
+            message = termcolor.colored(
+                f'[INFO] JAX gradient compiler will cast p-vars {pvars_cast} to float.',
+                'green')
+            print(message)
 
         # overwrite basic operations with fuzzy ones
         self.OPS = logic.get_operator_dicts()
@@ -300,6 +301,8 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
         return _jax_wrapped_stop_grad
 
     def _compile_cpfs(self, init_params):
+
+        # cpfs will all be cast to float
         cpfs_cast = set()
         jax_cpfs = {}
         for (_, cpfs) in self.levels.items():
@@ -312,11 +315,15 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
                 jax_cpfs[cpf] = self._jax_stop_grad(jax_cpfs[cpf])
 
         if cpfs_cast:
-
-
+            message = termcolor.colored(
+                f'[INFO] JAX gradient compiler will cast CPFs {cpfs_cast} to float.',
+                'green')
+            print(message)
         if self.cpfs_without_grad:
-
-
+            message = termcolor.colored(
+                f'[INFO] Gradients will not flow through CPFs {self.cpfs_without_grad}.',
+                'green')
+            print(message)
 
         return jax_cpfs
 
@@ -335,7 +342,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
 # ***********************************************************************
 
 
-class JaxPlan:
+class JaxPlan(metaclass=ABCMeta):
     '''Base class for all JAX policy representations.'''
 
     def __init__(self) -> None:
@@ -345,16 +352,18 @@ class JaxPlan:
         self._projection = None
         self.bounds = None
 
-    def summarize_hyperparameters(self) ->
-
-
+    def summarize_hyperparameters(self) -> str:
+        return self.__str__()
+
+    @abstractmethod
     def compile(self, compiled: JaxRDDLCompilerWithGrad,
                 _bounds: Bounds,
                 horizon: int) -> None:
-
+        pass
 
+    @abstractmethod
     def guess_next_epoch(self, params: Pytree) -> Pytree:
-
+        pass
 
     @property
     def initializer(self):
@@ -397,10 +406,11 @@ class JaxPlan:
                 continue
 
             # check invalid type
-            if prange not in compiled.JAX_TYPES:
+            if prange not in compiled.JAX_TYPES and prange not in compiled.rddl.enum_types:
+                keys = list(compiled.JAX_TYPES.keys()) + list(compiled.rddl.enum_types)
                 raise RDDLTypeError(
                     f'Invalid range <{prange}> of action-fluent <{name}>, '
-                    f'must be one of {
+                    f'must be one of {keys}.')
 
             # clip boolean to (0, 1), otherwise use the RDDL action bounds
             # or the user defined action bounds if provided
@@ -408,7 +418,12 @@ class JaxPlan:
             if prange == 'bool':
                 lower, upper = None, None
             else:
-
+                if prange in compiled.rddl.enum_types:
+                    lower = np.zeros(shape=shapes[name][1:])
+                    upper = len(compiled.rddl.type_to_objects[prange]) - 1
+                    upper = np.ones(shape=shapes[name][1:]) * upper
+                else:
+                    lower, upper = compiled.constraints.bounds[name]
             lower, upper = user_bounds.get(name, (lower, upper))
             lower = np.asarray(lower, dtype=compiled.REAL)
             upper = np.asarray(upper, dtype=compiled.REAL)
@@ -421,7 +436,10 @@ class JaxPlan:
                           ~lower_finite & upper_finite,
                           ~lower_finite & ~upper_finite]
             bounds[name] = (lower, upper)
-
+            message = termcolor.colored(
+                f'[INFO] Bounds of action-fluent <{name}> set to {bounds[name]}.',
+                'green')
+            print(message)
         return shapes, bounds, bounds_safe, cond_lists
 
     def _count_bool_actions(self, rddl: RDDLLiftedModel):
@@ -502,10 +520,11 @@ class JaxStraightLinePlan(JaxPlan):
         bool_action_count, allowed_actions = self._count_bool_actions(rddl)
         use_constraint_satisfaction = allowed_actions < bool_action_count
         if use_constraint_satisfaction:
-
-
-
-
+            message = termcolor.colored(
+                f'[INFO] SLP will use projected gradient to satisfy '
+                f'max_nondef_actions since total boolean actions '
+                f'{bool_action_count} > max_nondef_actions {allowed_actions}.', 'green')
+            print(message)
 
         noop = {var: (values[0] if isinstance(values, list) else values)
                 for (var, values) in rddl.action_fluents.items()}
@@ -623,7 +642,7 @@ class JaxStraightLinePlan(JaxPlan):
                 else:
                     action = _jax_non_bool_param_to_action(var, action, hyperparams)
                     action = jnp.clip(action, *bounds[var])
-                    if ranges[var] == 'int':
+                    if ranges[var] == 'int' or ranges[var] in rddl.enum_types:
                         action = jnp.asarray(jnp.round(action), dtype=compiled.INT)
                 actions[var] = action
             return actions
@@ -642,7 +661,7 @@ class JaxStraightLinePlan(JaxPlan):
         # only allow one action non-noop for now
         if 1 < allowed_actions < bool_action_count:
             raise RDDLNotImplementedError(
-                f'
+                f'SLPs with wrap_softmax currently '
                 f'do not support max-nondef-actions {allowed_actions} > 1.')
 
         # potentially apply projection but to non-bool actions only
@@ -764,7 +783,8 @@ class JaxStraightLinePlan(JaxPlan):
             for (var, action) in actions.items():
                 if ranges[var] == 'bool':
                     action = jnp.clip(action, min_action, max_action)
-
+                    param = _jax_bool_action_to_param(var, action, hyperparams)
+                    new_params[var] = param
                 else:
                     new_params[var] = action
             return new_params, converged
@@ -890,8 +910,7 @@ class JaxDeepReactivePolicy(JaxPlan):
         bool_action_count, allowed_actions = self._count_bool_actions(rddl)
         if 1 < allowed_actions < bool_action_count:
             raise RDDLNotImplementedError(
-                f'
-                f'max-nondef-actions {allowed_actions} > 1.')
+                f'DRPs currently do not support max-nondef-actions {allowed_actions} > 1.')
         use_constraint_satisfaction = allowed_actions < bool_action_count
 
         noop = {var: (values[0] if isinstance(values, list) else values)
@@ -927,15 +946,17 @@ class JaxDeepReactivePolicy(JaxPlan):
             if ranges[var] != 'bool':
                 value_size = np.size(values)
                 if normalize_per_layer and value_size == 1:
-
-                    f'Cannot apply layer norm to state-fluent <{var}> '
-                    f'of size 1: setting normalize_per_layer = False.', '
+                    message = termcolor.colored(
+                        f'[WARN] Cannot apply layer norm to state-fluent <{var}> '
+                        f'of size 1: setting normalize_per_layer = False.', 'yellow')
+                    print(message)
                     normalize_per_layer = False
                 non_bool_dims += value_size
         if not normalize_per_layer and non_bool_dims == 1:
-
-            'Cannot apply layer norm to state-fluents of total size 1: '
-            'setting normalize = False.', '
+            message = termcolor.colored(
+                '[WARN] Cannot apply layer norm to state-fluents of total size 1: '
+                'setting normalize = False.', 'yellow')
+            print(message)
             normalize = False
 
         # convert subs dictionary into a state vector to feed to the MLP
@@ -1061,7 +1082,7 @@ class JaxDeepReactivePolicy(JaxPlan):
                 prange = ranges[var]
                 if prange == 'bool':
                     new_action = action > 0.5
-                elif prange == 'int':
+                elif prange == 'int' or prange in rddl.enum_types:
                     action = jnp.clip(action, *bounds[var])
                     new_action = jnp.asarray(jnp.round(action), dtype=compiled.INT)
                 else:
@@ -1112,19 +1133,18 @@ class JaxDeepReactivePolicy(JaxPlan):
 
 
 class RollingMean:
-    '''Maintains
-    observations.'''
+    '''Maintains the rolling mean of a stream of real-valued observations.'''
 
     def __init__(self, window_size: int) -> None:
         self._window_size = window_size
         self._memory = deque(maxlen=window_size)
         self._total = 0
 
-    def update(self, x: float) -> float:
+    def update(self, x: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
         memory = self._memory
-        self._total
+        self._total = self._total + x
         if len(memory) == self._window_size:
-            self._total
+            self._total = self._total - memory.popleft()
         memory.append(x)
         return self._total / len(memory)
 
@@ -1147,14 +1167,16 @@ class JaxPlannerStatus(Enum):
         return self.value == 1 or self.value >= 4
 
 
-class JaxPlannerStoppingRule:
+class JaxPlannerStoppingRule(metaclass=ABCMeta):
     '''The base class of all planner stopping rules.'''
 
+    @abstractmethod
     def reset(self) -> None:
-
-
+        pass
+
+    @abstractmethod
     def monitor(self, callback: Dict[str, Any]) -> bool:
-
+        pass
 
 
 class NoImprovementStoppingRule(JaxPlannerStoppingRule):
@@ -1168,8 +1190,7 @@ class NoImprovementStoppingRule(JaxPlannerStoppingRule):
         self.iters_since_last_update = 0
 
     def monitor(self, callback: Dict[str, Any]) -> bool:
-        if self.callback is None
-        or callback['best_return'] > self.callback['best_return']:
+        if self.callback is None or callback['best_return'] > self.callback['best_return']:
             self.callback = callback
             self.iters_since_last_update = 0
         else:
@@ -1188,7 +1209,7 @@ class NoImprovementStoppingRule(JaxPlannerStoppingRule):
 # ***********************************************************************
 
 
-class PGPE:
+class PGPE(metaclass=ABCMeta):
     """Base class for all PGPE strategies."""
 
     def __init__(self) -> None:
@@ -1203,8 +1224,10 @@ class PGPE:
     def update(self):
         return self._update
 
-
-
+    @abstractmethod
+    def compile(self, loss_fn: Callable, projection: Callable, real_dtype: Type,
+                parallel_updates: Optional[int]=None) -> None:
+        pass
 
 
 class GaussianPGPE(PGPE):
@@ -1268,10 +1291,11 @@ class GaussianPGPE(PGPE):
             mu_optimizer = optax.inject_hyperparams(optimizer)(**optimizer_kwargs_mu)
             sigma_optimizer = optax.inject_hyperparams(optimizer)(**optimizer_kwargs_sigma)
         except Exception as _:
-
-
-            'rolling back to safer method:
-            '
+            message = termcolor.colored(
+                '[FAIL] Failed to inject hyperparameters into PGPE optimizer, '
+                'rolling back to safer method: '
+                'kl-divergence constraint will be disabled.', 'red')
+            print(message)
             mu_optimizer = optimizer(**optimizer_kwargs_mu)
             sigma_optimizer = optimizer(**optimizer_kwargs_sigma)
             max_kl_update = None
@@ -1297,15 +1321,16 @@ class GaussianPGPE(PGPE):
                 f' max_kl_update ={self.max_kl}\n'
         )
 
-    def compile(self, loss_fn: Callable, projection: Callable, real_dtype: Type
+    def compile(self, loss_fn: Callable, projection: Callable, real_dtype: Type,
+                parallel_updates: Optional[int]=None) -> None:
         sigma0 = self.init_sigma
-
+        sigma_lo, sigma_hi = self.sigma_range
         scale_reward = self.scale_reward
         min_reward_scale = self.min_reward_scale
         super_symmetric = self.super_symmetric
         super_symmetric_accurate = self.super_symmetric_accurate
         batch_size = self.batch_size
-
+        mu_optimizer, sigma_optimizer = self.optimizers
         max_kl = self.max_kl
 
         # entropy regularization penalty is decayed exponentially by elapsed budget
@@ -1322,13 +1347,22 @@ class GaussianPGPE(PGPE):
 
         def _jax_wrapped_pgpe_init(key, policy_params):
             mu = policy_params
-            sigma = jax.tree_map(
+            sigma = jax.tree_map(partial(jnp.full_like, fill_value=sigma0), mu)
             pgpe_params = (mu, sigma)
-            pgpe_opt_state =
-
-            return pgpe_params, pgpe_opt_state
+            pgpe_opt_state = (mu_optimizer.init(mu), sigma_optimizer.init(sigma))
+            r_max = -jnp.inf
+            return pgpe_params, pgpe_opt_state, r_max
 
-
+        if parallel_updates is None:
+            self._initializer = jax.jit(_jax_wrapped_pgpe_init)
+        else:
+
+            # for parallel policy update
+            def _jax_wrapped_pgpe_inits(key, policy_params):
+                keys = jnp.asarray(random.split(key, num=parallel_updates))
+                return jax.vmap(_jax_wrapped_pgpe_init, in_axes=0)(keys, policy_params)
+
+            self._initializer = jax.jit(_jax_wrapped_pgpe_inits)
 
         # ***********************************************************************
         # PARAMETER SAMPLING FUNCTIONS
@@ -1338,6 +1372,8 @@ class GaussianPGPE(PGPE):
         def _jax_wrapped_mu_noise(key, sigma):
             return sigma * random.normal(key, shape=jnp.shape(sigma), dtype=real_dtype)
 
+        # this samples a noise variable epsilon* from epsilon with the N(0, 1) density
+        # according to super-symmetric sampling paper
         def _jax_wrapped_epsilon_star(sigma, epsilon):
             c1, c2, c3 = -0.06655, -0.9706, 0.124
             phi = 0.67449 * sigma
@@ -1354,6 +1390,7 @@ class GaussianPGPE(PGPE):
             epsilon_star = jnp.sign(epsilon) * phi * jnp.exp(a)
             return epsilon_star
 
+        # implements baseline-free super-symmetric sampling to generate 4 trajectories
         def _jax_wrapped_sample_params(key, mu, sigma):
             treedef = jax.tree_util.tree_structure(sigma)
             keys = random.split(key, num=treedef.num_leaves)
@@ -1374,6 +1411,7 @@ class GaussianPGPE(PGPE):
         #
         # ***********************************************************************
 
+        # gradient with respect to mean
         def _jax_wrapped_mu_grad(epsilon, epsilon_star, r1, r2, r3, r4, m):
             if super_symmetric:
                 if scale_reward:
@@ -1393,6 +1431,7 @@ class GaussianPGPE(PGPE):
             grad = -r_mu * epsilon
             return grad
 
+        # gradient with respect to std. deviation
        def _jax_wrapped_sigma_grad(epsilon, epsilon_star, sigma, r1, r2, r3, r4, m, ent):
             if super_symmetric:
                 mask = r1 + r2 >= r3 + r4
@@ -1413,6 +1452,7 @@ class GaussianPGPE(PGPE):
             grad = -(r_sigma * s + ent / sigma)
             return grad
 
+        # calculate the policy gradients
         def _jax_wrapped_pgpe_grad(key, mu, sigma, r_max, ent,
                                    policy_hyperparams, subs, model_params):
             key, subkey = random.split(key)
@@ -1462,11 +1502,24 @@ class GaussianPGPE(PGPE):
         #
         # ***********************************************************************
 
+        # estimate KL divergence between two updates
         def _jax_wrapped_pgpe_kl_term(mu, sigma, old_mu, old_sigma):
             return 0.5 * jnp.sum(2 * jnp.log(sigma / old_sigma) +
                                  jnp.square(old_sigma / sigma) +
                                  jnp.square((mu - old_mu) / sigma) - 1)
 
+        # update mean and std. deviation with a gradient step
+        def _jax_wrapped_pgpe_update_helper(mu, sigma, mu_grad, sigma_grad,
+                                            mu_state, sigma_state):
+            mu_updates, new_mu_state = mu_optimizer.update(mu_grad, mu_state, params=mu)
+            sigma_updates, new_sigma_state = sigma_optimizer.update(
+                sigma_grad, sigma_state, params=sigma)
+            new_mu = optax.apply_updates(mu, mu_updates)
+            new_sigma = optax.apply_updates(sigma, sigma_updates)
+            new_sigma = jax.tree_map(
+                partial(jnp.clip, min=sigma_lo, max=sigma_hi), new_sigma)
+            return new_mu, new_sigma, new_mu_state, new_sigma_state
+
         def _jax_wrapped_pgpe_update(key, pgpe_params, r_max, progress,
                                      policy_hyperparams, subs, model_params,
                                      pgpe_opt_state):
@@ -1476,12 +1529,9 @@ class GaussianPGPE(PGPE):
             ent = start_entropy_coeff * jnp.power(entropy_coeff_decay, progress)
             mu_grad, sigma_grad, new_r_max = _jax_wrapped_pgpe_grad_batched(
                 key, pgpe_params, r_max, ent, policy_hyperparams, subs, model_params)
-
-
-
-            new_mu = optax.apply_updates(mu, mu_updates)
-            new_sigma = optax.apply_updates(sigma, sigma_updates)
-            new_sigma = jax.tree_map(lambda x: jnp.clip(x, *sigma_range), new_sigma)
+            new_mu, new_sigma, new_mu_state, new_sigma_state = \
+                _jax_wrapped_pgpe_update_helper(mu, sigma, mu_grad, sigma_grad,
+                                                mu_state, sigma_state)
 
             # respect KL divergence contraint with old parameters
             if max_kl is not None:
@@ -1493,12 +1543,9 @@ class GaussianPGPE(PGPE):
                 kl_reduction = jnp.minimum(1.0, jnp.sqrt(max_kl / total_kl))
                 mu_state.hyperparams['learning_rate'] = old_mu_lr * kl_reduction
                 sigma_state.hyperparams['learning_rate'] = old_sigma_lr * kl_reduction
-
-
-
-                new_mu = optax.apply_updates(mu, mu_updates)
-                new_sigma = optax.apply_updates(sigma, sigma_updates)
-                new_sigma = jax.tree_map(lambda x: jnp.clip(x, *sigma_range), new_sigma)
+                new_mu, new_sigma, new_mu_state, new_sigma_state = \
+                    _jax_wrapped_pgpe_update_helper(mu, sigma, mu_grad, sigma_grad,
+                                                    mu_state, sigma_state)
                 new_mu_state.hyperparams['learning_rate'] = old_mu_lr
                 new_sigma_state.hyperparams['learning_rate'] = old_sigma_lr
 
@@ -1509,7 +1556,21 @@ class GaussianPGPE(PGPE):
             policy_params = new_mu
             return new_pgpe_params, new_r_max, new_pgpe_opt_state, policy_params, converged
 
-
+        if parallel_updates is None:
+            self._update = jax.jit(_jax_wrapped_pgpe_update)
+        else:
+
+            # for parallel policy update
+            def _jax_wrapped_pgpe_updates(key, pgpe_params, r_max, progress,
+                                          policy_hyperparams, subs, model_params,
+                                          pgpe_opt_state):
+                keys = jnp.asarray(random.split(key, num=parallel_updates))
+                return jax.vmap(
+                    _jax_wrapped_pgpe_update, in_axes=(0, 0, 0, None, None, None, 0, 0)
+                )(keys, pgpe_params, r_max, progress, policy_hyperparams, subs,
+                  model_params, pgpe_opt_state)
+
+            self._update = jax.jit(_jax_wrapped_pgpe_updates)
 
 
         # ***********************************************************************
@@ -1565,6 +1626,7 @@ def cvar_utility(returns: jnp.ndarray, alpha: float) -> float:
     return jnp.sum(returns * weights)
 
 
+# set of all currently valid built-in utility functions
 UTILITY_LOOKUP = {
     'mean': jnp.mean,
     'mean_var': mean_variance_utility,
@@ -1609,7 +1671,8 @@ class JaxBackpropPlanner:
                  cpfs_without_grad: Optional[Set[str]]=None,
                  compile_non_fluent_exact: bool=True,
                  logger: Optional[Logger]=None,
-                 dashboard_viz: Optional[Any]=None
+                 dashboard_viz: Optional[Any]=None,
+                 parallel_updates: Optional[int]=None) -> None:
         '''Creates a new gradient-based algorithm for optimizing action sequences
         (plan) in the given RDDL. Some operations will be converted to their
         differentiable counterparts; the specific operations can be customized
@@ -1649,6 +1712,7 @@ class JaxBackpropPlanner:
         :param logger: to log information about compilation to file
         :param dashboard_viz: optional visualizer object from the environment
         to pass to the dashboard to visualize the policy
+        :param parallel_updates: how many optimizers to run independently in parallel
         '''
         self.rddl = rddl
         self.plan = plan
@@ -1656,6 +1720,7 @@ class JaxBackpropPlanner:
         if batch_size_test is None:
             batch_size_test = batch_size_train
         self.batch_size_test = batch_size_test
+        self.parallel_updates = parallel_updates
         if rollout_horizon is None:
             rollout_horizon = rddl.horizon
         self.horizon = rollout_horizon
@@ -1677,10 +1742,11 @@ class JaxBackpropPlanner:
         try:
             optimizer = optax.inject_hyperparams(optimizer)(**optimizer_kwargs)
         except Exception as _:
-
-
-            'rolling back to safer method: please note that modification of '
-            '
+            message = termcolor.colored(
+                '[FAIL] Failed to inject hyperparameters into JaxPlan optimizer, '
+                'rolling back to safer method: please note that runtime modification of '
+                'hyperparameters will be disabled.', 'red')
+            print(message)
             optimizer = optimizer(**optimizer_kwargs)
 
         # apply optimizer chain of transformations
@@ -1700,7 +1766,7 @@ class JaxBackpropPlanner:
             utility_fn = UTILITY_LOOKUP.get(utility, None)
             if utility_fn is None:
                 raise RDDLNotImplementedError(
-                    f'Utility <{utility}> is not supported, '
+                    f'Utility function <{utility}> is not supported, '
                     f'must be one of {list(UTILITY_LOOKUP.keys())}.')
         else:
             utility_fn = utility
@@ -1742,7 +1808,7 @@ r"""
 \/_____/ \/_/\/_/ \/_/\/_/ \/_/ \/_____/ \/_/\/_/ \/_/ \/_/
 """
 
-        return ('\n'
+        return (f'\n'
                 f'{LOGO}\n'
                 f'Version {__version__}\n'
                 f'Python {sys.version}\n'
@@ -1751,7 +1817,23 @@ r"""
                 f'numpy {np.__version__}\n'
                 f'devices: {devices_short}\n')
 
-    def
+    def summarize_relaxations(self) -> str:
+        result = ''
+        if self.compiled.model_params:
+            result += ('Some RDDL operations are non-differentiable '
+                       'and will be approximated as follows:' + '\n')
+            exprs_by_rddl_op, values_by_rddl_op = {}, {}
+            for info in self.compiled.model_parameter_info().values():
+                rddl_op = info['rddl_op']
+                exprs_by_rddl_op.setdefault(rddl_op, []).append(info['id'])
+                values_by_rddl_op.setdefault(rddl_op, []).append(info['init_value'])
+            for rddl_op in sorted(exprs_by_rddl_op.keys()):
+                result += (f' {rddl_op}:\n'
+                           f' addresses ={exprs_by_rddl_op[rddl_op]}\n'
+                           f' init_values={values_by_rddl_op[rddl_op]}\n')
+        return result
+
+    def summarize_hyperparameters(self) -> str:
         result = (f'objective hyper-parameters:\n'
                   f' utility_fn ={self.utility.__name__}\n'
                   f' utility args ={self.utility_kwargs}\n'
@@ -1769,30 +1851,14 @@ r"""
                   f' line_search_kwargs={self.line_search_kwargs}\n'
                   f' noise_kwargs ={self.noise_kwargs}\n'
                   f' batch_size_train ={self.batch_size_train}\n'
-                  f' batch_size_test ={self.batch_size_test}\n'
+                  f' batch_size_test ={self.batch_size_test}\n'
+                  f' parallel_updates ={self.parallel_updates}\n')
         result += str(self.plan)
         if self.use_pgpe:
             result += str(self.pgpe)
         result += str(self.logic)
-
-        # print model relaxation information
-        if self.compiled.model_params:
-            result += ('Some RDDL operations are non-differentiable '
-                       'and will be approximated as follows:' + '\n')
-            exprs_by_rddl_op, values_by_rddl_op = {}, {}
-            for info in self.compiled.model_parameter_info().values():
-                rddl_op = info['rddl_op']
-                exprs_by_rddl_op.setdefault(rddl_op, []).append(info['id'])
-                values_by_rddl_op.setdefault(rddl_op, []).append(info['init_value'])
-            for rddl_op in sorted(exprs_by_rddl_op.keys()):
-                result += (f' {rddl_op}:\n'
-                           f' addresses ={exprs_by_rddl_op[rddl_op]}\n'
-                           f' init_values={values_by_rddl_op[rddl_op]}\n')
         return result
 
-    def summarize_hyperparameters(self) -> None:
-        print(self.__str__())
-
     # ===========================================================================
     # COMPILATION SUBROUTINES
     # ===========================================================================
@@ -1844,23 +1910,31 @@ r"""
         self.test_rollouts = jax.jit(test_rollouts)
 
         # initialization
-        self.initialize =
+        self.initialize, self.init_optimizer = self._jax_init()
 
         # losses
         train_loss = self._jax_loss(train_rollouts, use_symlog=self.use_symlog_reward)
-
+        test_loss = self._jax_loss(test_rollouts, use_symlog=False)
+        if self.parallel_updates is None:
+            self.test_loss = jax.jit(test_loss)
+        else:
+            self.test_loss = jax.jit(jax.vmap(test_loss, in_axes=(None, 0, None, None, 0)))
 
         # optimization
         self.update = self._jax_update(train_loss)
+        self.pytree_at = jax.jit(lambda tree, i: jax.tree_map(lambda x: x[i], tree))
 
         # pgpe option
         if self.use_pgpe:
-            loss_fn = self._jax_loss(rollouts=test_rollouts)
             self.pgpe.compile(
-                loss_fn=
+                loss_fn=test_loss,
                 projection=self.plan.projection,
-                real_dtype=self.test_compiled.REAL
+                real_dtype=self.test_compiled.REAL,
+                parallel_updates=self.parallel_updates
             )
+            self.merge_pgpe = self._jax_merge_pgpe_jaxplan()
+        else:
+            self.merge_pgpe = None
 
     def _jax_return(self, use_symlog):
         gamma = self.rddl.discount
@@ -1900,24 +1974,43 @@ r"""
    def _jax_init(self):
         init = self.plan.initializer
         optimizer = self.optimizer
+        num_parallel = self.parallel_updates
 
         # initialize both the policy and its optimizer
         def _jax_wrapped_init_policy(key, policy_hyperparams, subs):
             policy_params = init(key, policy_hyperparams, subs)
             opt_state = optimizer.init(policy_params)
-            return policy_params, opt_state, {}
+            return policy_params, opt_state, {}
 
-
+        # initialize just the optimizer from the policy
+        def _jax_wrapped_init_opt(policy_params):
+            if num_parallel is None:
+                opt_state = optimizer.init(policy_params)
+            else:
+                opt_state = jax.vmap(optimizer.init, in_axes=0)(policy_params)
+            return opt_state, {}
+
+        if num_parallel is None:
+            return jax.jit(_jax_wrapped_init_policy), jax.jit(_jax_wrapped_init_opt)
+
+        # for parallel policy update
+        def _jax_wrapped_init_policies(key, policy_hyperparams, subs):
+            keys = jnp.asarray(random.split(key, num=num_parallel))
+            return jax.vmap(_jax_wrapped_init_policy, in_axes=(0, None, None))(
+                keys, policy_hyperparams, subs)
+
+        return jax.jit(_jax_wrapped_init_policies), jax.jit(_jax_wrapped_init_opt)
 
     def _jax_update(self, loss):
         optimizer = self.optimizer
         projection = self.plan.projection
         use_ls = self.line_search_kwargs is not None
+        num_parallel = self.parallel_updates
 
         # check if the gradients are all zeros
         def _jax_wrapped_zero_gradients(grad):
             leaves, _ = jax.tree_util.tree_flatten(
-                jax.tree_map(
+                jax.tree_map(partial(jnp.allclose, b=0), grad))
             return jnp.all(jnp.asarray(leaves))
 
         # calculate the plan gradient w.r.t. return loss and update optimizer
@@ -1948,8 +2041,43 @@ r"""
             return policy_params, converged, opt_state, opt_aux, \
                 loss_val, log, model_params, zero_grads
 
-
+        if num_parallel is None:
+            return jax.jit(_jax_wrapped_plan_update)
+
+        # for parallel policy update
+        def _jax_wrapped_plan_updates(key, policy_params, policy_hyperparams,
+                                      subs, model_params, opt_state, opt_aux):
+            keys = jnp.asarray(random.split(key, num=num_parallel))
+            return jax.vmap(
+                _jax_wrapped_plan_update, in_axes=(0, 0, None, None, 0, 0, 0)
+            )(keys, policy_params, policy_hyperparams, subs, model_params,
+              opt_state, opt_aux)
+
+        return jax.jit(_jax_wrapped_plan_updates)
 
+    def _jax_merge_pgpe_jaxplan(self):
+        if self.parallel_updates is None:
+            return None
+
+        # for parallel policy update
+        # currently implements a hard replacement where the jaxplan parameter
+        # is replaced by the PGPE parameter if the latter is an improvement
+        def _jax_wrapped_pgpe_jaxplan_merge(pgpe_mask, pgpe_param, policy_params,
+                                            pgpe_loss, test_loss,
+                                            pgpe_loss_smooth, test_loss_smooth,
+                                            pgpe_converged, converged):
+            def select_fn(leaf1, leaf2):
+                expanded_mask = pgpe_mask[(...,) + (jnp.newaxis,) * (jnp.ndim(leaf1) - 1)]
+                return jnp.where(expanded_mask, leaf1, leaf2)
+            policy_params = jax.tree_map(select_fn, pgpe_param, policy_params)
+            test_loss = jnp.where(pgpe_mask, pgpe_loss, test_loss)
+            test_loss_smooth = jnp.where(pgpe_mask, pgpe_loss_smooth, test_loss_smooth)
+            expanded_mask = pgpe_mask[(...,) + (jnp.newaxis,) * (jnp.ndim(converged) - 1)]
+            converged = jnp.where(expanded_mask, pgpe_converged, converged)
+            return policy_params, test_loss, test_loss_smooth, converged
+
+        return jax.jit(_jax_wrapped_pgpe_jaxplan_merge)
+
     def _batched_init_subs(self, subs):
         rddl = self.rddl
         n_train, n_test = self.batch_size_train, self.batch_size_test
@@ -1968,6 +2096,13 @@ r"""
             train_value = np.asarray(train_value, dtype=self.compiled.REAL)
             init_train[name] = train_value
             init_test[name] = np.repeat(value, repeats=n_test, axis=0)
+
+            # safely cast test subs variable to required type in case the type is wrong
+            if name in rddl.variable_ranges:
+                required_type = RDDLValueInitializer.NUMPY_TYPES.get(
+                    rddl.variable_ranges[name], RDDLValueInitializer.INT)
+                if np.result_type(init_test[name]) != required_type:
+                    init_test[name] = np.asarray(init_test[name], dtype=required_type)
 
         # make sure next-state fluents are also set
         for (state, next_state) in rddl.next_state.items():
@@ -1975,6 +2110,19 @@ r"""
             init_test[next_state] = init_test[state]
         return init_train, init_test
 
+    def _broadcast_pytree(self, pytree):
+        if self.parallel_updates is None:
+            return pytree
+
+        # for parallel policy update
+        def make_batched(x):
+            x = np.asarray(x)
+            x = np.broadcast_to(
+                x[np.newaxis, ...], shape=(self.parallel_updates,) + np.shape(x))
+            return x
+
+        return jax.tree_map(make_batched, pytree)
+
     def as_optimization_problem(
             self, key: Optional[random.PRNGKey]=None,
             policy_hyperparams: Optional[Pytree]=None,
@@ -2002,6 +2150,11 @@ r"""
         :param grad_function_updates_key: if True, the gradient function
         updates the PRNG key internally independently of the loss function.
         '''
+
+        # make sure parallel updates are disabled
+        if self.parallel_updates is not None:
+            raise ValueError('Cannot compile static optimization problem '
+                             'when parallel_updates is not None.')
 
         # if PRNG key is not provided
         if key is None:
@@ -2012,8 +2165,10 @@ r"""
         train_subs, _ = self._batched_init_subs(subs)
         model_params = self.compiled.model_params
         if policy_hyperparams is None:
-
-
+            message = termcolor.colored(
+                '[WARN] policy_hyperparams is not set, setting 1.0 for '
+                'all action-fluents which could be suboptimal.', 'yellow')
+            print(message)
             policy_hyperparams = {action: 1.0
                                   for action in self.rddl.action_fluents}
 
@@ -2084,10 +2239,12 @@ r"""
         their values: if None initializes all variables from the RDDL instance
         :param guess: initial policy parameters: if None will use the initializer
         specified in this instance
-        :param print_summary: whether to print planner header
-        summary, and diagnosis
+        :param print_summary: whether to print planner header and diagnosis
         :param print_progress: whether to print the progress bar during training
+        :param print_hyperparams: whether to print list of hyper-parameter settings
         :param stopping_rule: stopping criterion
+        :param restart_epochs: restart the optimizer from a random policy configuration
+        if there is no progress for this many consecutive iterations
         :param test_rolling_window: the test return is averaged on a rolling
         window of the past test_rolling_window returns when updating the best
         parameters found so far
@@ -2120,7 +2277,9 @@ r"""
                  guess: Optional[Pytree]=None,
                  print_summary: bool=True,
                  print_progress: bool=True,
+                 print_hyperparams: bool=False,
                  stopping_rule: Optional[JaxPlannerStoppingRule]=None,
+                 restart_epochs: int=999999,
                  test_rolling_window: int=10,
                  tqdm_position: Optional[int]=None) -> Generator[Dict[str, Any], None, None]:
         '''Returns a generator for computing an optimal policy or plan.
@@ -2139,10 +2298,12 @@ r"""
         their values: if None initializes all variables from the RDDL instance
         :param guess: initial policy parameters: if None will use the initializer
         specified in this instance
-        :param print_summary: whether to print planner header
-        summary, and diagnosis
+        :param print_summary: whether to print planner header and diagnosis
         :param print_progress: whether to print the progress bar during training
+        :param print_hyperparams: whether to print list of hyper-parameter settings
         :param stopping_rule: stopping criterion
+        :param restart_epochs: restart the optimizer from a random policy configuration
+        if there is no progress for this many consecutive iterations
         :param test_rolling_window: the test return is averaged on a rolling
         window of the past test_rolling_window returns when updating the best
         parameters found so far
@@ -2155,6 +2316,14 @@ r"""
         # INITIALIZATION OF HYPER-PARAMETERS
         # ======================================================================
 
+        # cannot run dashboard with parallel updates
+        if dashboard is not None and self.parallel_updates is not None:
+            message = termcolor.colored(
+                '[WARN] Dashboard is unavailable if parallel_updates is not None: '
+                'setting dashboard to None.', 'yellow')
+            print(message)
+            dashboard = None
+
         # if PRNG key is not provided
         if key is None:
             key = random.PRNGKey(round(time.time() * 1000))
@@ -2162,15 +2331,19 @@ r"""
 
         # if policy_hyperparams is not provided
         if policy_hyperparams is None:
-
-
+            message = termcolor.colored(
+                '[WARN] policy_hyperparams is not set, setting 1.0 for '
+                'all action-fluents which could be suboptimal.', 'yellow')
+            print(message)
             policy_hyperparams = {action: 1.0
                                   for action in self.rddl.action_fluents}
 
         # if policy_hyperparams is a scalar
         elif isinstance(policy_hyperparams, (int, float, np.number)):
-
-
+            message = termcolor.colored(
+                f'[INFO] policy_hyperparams is {policy_hyperparams}, '
+                f'setting this value for all action-fluents.', 'green')
+            print(message)
             hyperparam_value = float(policy_hyperparams)
             policy_hyperparams = {action: hyperparam_value
                                   for action in self.rddl.action_fluents}
@@ -2179,14 +2352,19 @@ r"""
         elif isinstance(policy_hyperparams, dict):
             for action in self.rddl.action_fluents:
                 if action not in policy_hyperparams:
-
-
+                    message = termcolor.colored(
+                        f'[WARN] policy_hyperparams[{action}] is not set, '
+                        f'setting 1.0 for missing action-fluents '
+                        f'which could be suboptimal.', 'yellow')
+                    print(message)
                     policy_hyperparams[action] = 1.0
 
         # print summary of parameters:
         if print_summary:
             print(self.summarize_system())
-            self.
+            print(self.summarize_relaxations())
+            if print_hyperparams:
+                print(self.summarize_hyperparameters())
             print(f'optimize() call hyper-parameters:\n'
                   f' PRNG key ={key}\n'
                   f' max_iterations ={epochs}\n'
@@ -2200,7 +2378,8 @@ r"""
                  f' dashboard_id ={dashboard_id}\n'
                  f' print_summary ={print_summary}\n'
                  f' print_progress ={print_progress}\n'
-                 f' stopping_rule ={stopping_rule}\n'
+                 f' stopping_rule ={stopping_rule}\n'
+                 f' restart_epochs ={restart_epochs}\n')
 
         # ======================================================================
         # INITIALIZATION OF STATE AND POLICY
@@ -2218,15 +2397,17 @@ r"""
                     subs[var] = value
                     added_pvars_to_subs.append(var)
             if added_pvars_to_subs:
-
-
-
+                message = termcolor.colored(
+                    f'[INFO] p-variables {added_pvars_to_subs} is not in '
+                    f'provided subs, using their initial values.', 'green')
+                print(message)
         train_subs, test_subs = self._batched_init_subs(subs)
 
         # initialize model parameters
         if model_params is None:
             model_params = self.compiled.model_params
-
+        model_params = self._broadcast_pytree(model_params)
+        model_params_test = self._broadcast_pytree(self.test_compiled.model_params)
 
         # initialize policy parameters
         if guess is None:
@@ -2234,29 +2415,31 @@ r"""
             policy_params, opt_state, opt_aux = self.initialize(
                 subkey, policy_hyperparams, train_subs)
         else:
-            policy_params = guess
-            opt_state = self.
-            opt_aux = {}
+            policy_params = self._broadcast_pytree(guess)
+            opt_state, opt_aux = self.init_optimizer(policy_params)
 
         # initialize pgpe parameters
         if self.use_pgpe:
-            pgpe_params, pgpe_opt_state = self.pgpe.initialize(key, policy_params)
+            pgpe_params, pgpe_opt_state, r_max = self.pgpe.initialize(key, policy_params)
             rolling_pgpe_loss = RollingMean(test_rolling_window)
         else:
-            pgpe_params, pgpe_opt_state = None, None
+            pgpe_params, pgpe_opt_state, r_max = None, None, None
             rolling_pgpe_loss = None
         total_pgpe_it = 0
-        r_max = -jnp.inf
 
         # ======================================================================
         # INITIALIZATION OF RUNNING STATISTICS
         # ======================================================================
 
         # initialize running statistics
-
+        if self.parallel_updates is None:
+            best_params = policy_params
+        else:
+            best_params = self.pytree_at(policy_params, 0)
+        best_loss, pbest_loss, best_grad = np.inf, np.inf, None
         last_iter_improve = 0
+        no_progress_count = 0
         rolling_test_loss = RollingMean(test_rolling_window)
-        log = {}
         status = JaxPlannerStatus.NORMAL
         progress_percent = 0
 
@@ -2277,6 +2460,11 @@ r"""
         else:
             progress_bar = None
         position_str = '' if tqdm_position is None else f'[{tqdm_position}]'
+
+        # error handlers (to avoid spam messaging)
+        policy_constraint_msg_shown = False
+        jax_train_msg_shown = False
+        jax_test_msg_shown = False
 
         # ======================================================================
         # MAIN TRAINING LOOP BEGINS
@@ -2296,8 +2484,13 @@ r"""
              model_params, zero_grads) = self.update(
                 subkey, policy_params, policy_hyperparams, train_subs, model_params,
                 opt_state, opt_aux)
+
+            # evaluate
             test_loss, (test_log, model_params_test) = self.test_loss(
                 subkey, policy_params, policy_hyperparams, test_subs, model_params_test)
+            if self.parallel_updates:
+                train_loss = np.asarray(train_loss)
+                test_loss = np.asarray(test_loss)
             test_loss_smooth = rolling_test_loss.update(test_loss)
 
             # pgpe update of the plan
@@ -2308,52 +2501,112 @@ r"""
|
|
|
2308
2501
|
self.pgpe.update(subkey, pgpe_params, r_max, progress_percent,
|
|
2309
2502
|
policy_hyperparams, test_subs, model_params_test,
|
|
2310
2503
|
pgpe_opt_state)
|
|
2504
|
+
|
|
2505
|
+
# evaluate
|
|
2311
2506
|
pgpe_loss, _ = self.test_loss(
|
|
2312
2507
|
subkey, pgpe_param, policy_hyperparams, test_subs, model_params_test)
|
|
2508
|
+
if self.parallel_updates:
|
|
2509
|
+
pgpe_loss = np.asarray(pgpe_loss)
|
|
2313
2510
|
pgpe_loss_smooth = rolling_pgpe_loss.update(pgpe_loss)
|
|
2314
2511
|
pgpe_return = -pgpe_loss_smooth
|
|
2315
2512
|
|
|
2316
|
-
# replace with PGPE if
|
|
2317
|
-
if
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2513
|
+
# replace JaxPlan with PGPE if new minimum reached or train loss invalid
|
|
2514
|
+
if self.parallel_updates is None:
|
|
2515
|
+
if pgpe_loss_smooth < best_loss or not np.isfinite(train_loss):
|
|
2516
|
+
policy_params = pgpe_param
|
|
2517
|
+
test_loss, test_loss_smooth = pgpe_loss, pgpe_loss_smooth
|
|
2518
|
+
converged = pgpe_converged
|
|
2519
|
+
pgpe_improve = True
|
|
2520
|
+
total_pgpe_it += 1
|
|
2521
|
+
else:
|
|
2522
|
+
pgpe_mask = (pgpe_loss_smooth < pbest_loss) | ~np.isfinite(train_loss)
|
|
2523
|
+
if np.any(pgpe_mask):
|
|
2524
|
+
policy_params, test_loss, test_loss_smooth, converged = \
|
|
2525
|
+
self.merge_pgpe(pgpe_mask, pgpe_param, policy_params,
|
|
2526
|
+
pgpe_loss, test_loss,
|
|
2527
|
+
pgpe_loss_smooth, test_loss_smooth,
|
|
2528
|
+
pgpe_converged, converged)
|
|
2529
|
+
pgpe_improve = True
|
|
2530
|
+
total_pgpe_it += 1
|
|
2323
2531
|
else:
|
|
2324
2532
|
pgpe_loss, pgpe_loss_smooth, pgpe_return = None, None, None
|
|
2325
2533
|
|
|
2326
|
-
# evaluate test losses and record best
|
|
2327
|
-
if
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2534
|
+
# evaluate test losses and record best parameters so far
|
|
2535
|
+
if self.parallel_updates is None:
|
|
2536
|
+
if test_loss_smooth < best_loss:
|
|
2537
|
+
best_params, best_loss, best_grad = \
|
|
2538
|
+
policy_params, test_loss_smooth, train_log['grad']
|
|
2539
|
+
pbest_loss = best_loss
|
|
2540
|
+
else:
|
|
2541
|
+
best_index = np.argmin(test_loss_smooth)
|
|
2542
|
+
if test_loss_smooth[best_index] < best_loss:
|
|
2543
|
+
best_params = self.pytree_at(policy_params, best_index)
|
|
2544
|
+
best_grad = self.pytree_at(train_log['grad'], best_index)
|
|
2545
|
+
best_loss = test_loss_smooth[best_index]
|
|
2546
|
+
pbest_loss = np.minimum(pbest_loss, test_loss_smooth)
|
|
 
             # ==================================================================
             # STATUS CHECKS AND LOGGING
             # ==================================================================
 
             # no progress
-
+            no_progress_flag = (not pgpe_improve) and np.all(zero_grads)
+            if no_progress_flag:
                 status = JaxPlannerStatus.NO_PROGRESS
-
+
             # constraint satisfaction problem
-            if not np.all(converged):
-
-
-
-
+            if not np.all(converged):
+                if progress_bar is not None and not policy_constraint_msg_shown:
+                    message = termcolor.colored(
+                        '[FAIL] Policy update failed to satisfy action constraints.',
+                        'red')
+                    progress_bar.write(message)
+                    policy_constraint_msg_shown = True
                 status = JaxPlannerStatus.PRECONDITION_POSSIBLY_UNSATISFIED
 
             # numerical error
             if self.use_pgpe:
-                invalid_loss = not (np.isfinite(train_loss) or
+                invalid_loss = not (np.any(np.isfinite(train_loss)) or
+                                    np.any(np.isfinite(pgpe_loss)))
             else:
-                invalid_loss = not np.isfinite(train_loss)
+                invalid_loss = not np.any(np.isfinite(train_loss))
             if invalid_loss:
-
+                if progress_bar is not None:
+                    message = termcolor.colored(
+                        f'[FAIL] Planner aborted due to invalid train loss {train_loss}.',
+                        'red')
+                    progress_bar.write(message)
                 status = JaxPlannerStatus.INVALID_GRADIENT
 
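
The invalid-loss test above switches from a scalar check to np.any(np.isfinite(...)) because, with parallel updates, train_loss (and pgpe_loss) hold one value per run; the planner now aborts only when no run produced a finite loss. A small NumPy illustration:

    import numpy as np

    train_loss = np.array([np.nan, 3.2, np.inf])          # one loss per parallel run
    invalid_loss = not np.any(np.isfinite(train_loss))
    print(invalid_loss)   # False: run 1 is still usable, so training continues

    invalid_loss = not np.any(np.isfinite(np.array([np.nan, np.inf])))
    print(invalid_loss)   # True: every run failed, status becomes INVALID_GRADIENT
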
+            # problem in the model compilation
+            if progress_bar is not None:
+
+                # train model
+                if not jax_train_msg_shown:
+                    messages = set()
+                    for error_code in np.unique(train_log['error']):
+                        messages.update(JaxRDDLCompiler.get_error_messages(error_code))
+                    if messages:
+                        messages = '\n '.join(messages)
+                        message = termcolor.colored(
+                            f'[FAIL] Compiler encountered the following '
+                            f'error(s) in the training model:\n {messages}', 'red')
+                        progress_bar.write(message)
+                        jax_train_msg_shown = True
+
+                # test model
+                if not jax_test_msg_shown:
+                    messages = set()
+                    for error_code in np.unique(test_log['error']):
+                        messages.update(JaxRDDLCompiler.get_error_messages(error_code))
+                    if messages:
+                        messages = '\n '.join(messages)
+                        message = termcolor.colored(
+                            f'[FAIL] Compiler encountered the following '
+                            f'error(s) in the testing model:\n {messages}', 'red')
+                        progress_bar.write(message)
+                        jax_test_msg_shown = True
+
             # reached computation budget
             elapsed = time.time() - start_time - elapsed_outside_loop
             if elapsed >= train_seconds:
@@ -2387,20 +2640,39 @@ r"""
                 **test_log
             }
 
+            # hard restart
+            if guess is None and no_progress_flag:
+                no_progress_count += 1
+                if no_progress_count > restart_epochs:
+                    key, subkey = random.split(key)
+                    policy_params, opt_state, opt_aux = self.initialize(
+                        subkey, policy_hyperparams, train_subs)
+                    no_progress_count = 0
+                    if progress_bar is not None:
+                        message = termcolor.colored(
+                            f'[INFO] Optimizer restarted at iteration {it} '
+                            f'due to lack of progress.', 'green')
+                        progress_bar.write(message)
+            else:
+                no_progress_count = 0
+
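
The hard restart above applies only to open-loop optimization (guess is None) after restart_epochs consecutive no-progress iterations; note that the PRNG key is split so each re-initialization draws fresh parameters while the main key stream keeps advancing. A minimal sketch of that split-and-reinitialize pattern (the normal draw below is a stand-in for self.initialize, which is not reproduced here):

    import jax

    key = jax.random.PRNGKey(42)
    for restart in range(3):
        key, subkey = jax.random.split(key)
        init_params = jax.random.normal(subkey, (4,))   # stand-in for self.initialize(...)
        print(restart, init_params[:2])                 # a different draw on every restart
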
             # stopping condition reached
             if stopping_rule is not None and stopping_rule.monitor(callback):
+                if progress_bar is not None:
+                    message = termcolor.colored(
+                        '[SUCC] Stopping rule has been reached.', 'green')
+                    progress_bar.write(message)
                 callback['status'] = status = JaxPlannerStatus.STOPPING_RULE_REACHED
 
             # if the progress bar is used
             if print_progress:
                 progress_bar.set_description(
-                    f'{position_str} {it:6} it / {-train_loss:14.5f} train / '
-                    f'{-test_loss_smooth:14.5f} test / {-best_loss:14.5f} best / '
+                    f'{position_str} {it:6} it / {-np.min(train_loss):14.5f} train / '
+                    f'{-np.min(test_loss_smooth):14.5f} test / {-best_loss:14.5f} best / '
                     f'{status.value} status / {total_pgpe_it:6} pgpe',
-                    refresh=False
-                )
+                    refresh=False)
                 progress_bar.set_postfix_str(
-                    f
+                    f'{(it + 1) / (elapsed + 1e-6):.2f}it/s', refresh=False)
                 progress_bar.update(progress_percent - progress_bar.n)
 
             # dash-board
@@ -2423,24 +2695,15 @@ r"""
         # release resources
         if print_progress:
             progress_bar.close()
-
-        # validate the test return
-        if log:
-            messages = set()
-            for error_code in np.unique(log['error']):
-                messages.update(JaxRDDLCompiler.get_error_messages(error_code))
-            if messages:
-                messages = '\n'.join(messages)
-                raise_warning('JAX compiler encountered the following '
-                              'error(s) in the original RDDL formulation '
-                              f'during test evaluation:\n{messages}', 'red')
+            print()
 
         # summarize and test for convergence
         if print_summary:
             grad_norm = jax.tree_map(lambda x: np.linalg.norm(x).item(), best_grad)
             diagnosis = self._perform_diagnosis(
-                last_iter_improve, -train_loss, -test_loss_smooth,
-
+                last_iter_improve, -np.min(train_loss), -np.min(test_loss_smooth),
+                -best_loss, grad_norm)
+            print(f'Summary of optimization:\n'
                   f' status ={status}\n'
                   f' time ={elapsed:.3f} sec.\n'
                   f' iterations ={it}\n'
@@ -2453,12 +2716,9 @@ r"""
         max_grad_norm = max(jax.tree_util.tree_leaves(grad_norm))
         grad_is_zero = np.allclose(max_grad_norm, 0)
 
-        validation_error = 100 * abs(test_return - train_return) / \
-            max(abs(train_return), abs(test_return))
-
         # divergence if the solution is not finite
         if not np.isfinite(train_return):
-            return termcolor.colored('[
+            return termcolor.colored('[FAIL] Training loss diverged.', 'red')
 
         # hit a plateau is likely IF:
         # 1. planner does not improve at all
@@ -2466,23 +2726,25 @@ r"""
         if last_iter_improve <= 1:
             if grad_is_zero:
                 return termcolor.colored(
-                    '[
+                    f'[FAIL] No progress was made '
                     f'and max grad norm {max_grad_norm:.6f} was zero: '
-                    'solver likely stuck in a plateau.', 'red')
+                    f'solver likely stuck in a plateau.', 'red')
             else:
                 return termcolor.colored(
-                    '[
+                    f'[FAIL] No progress was made '
                     f'but max grad norm {max_grad_norm:.6f} was non-zero: '
-                    'learning rate or other hyper-parameters
+                    f'learning rate or other hyper-parameters could be suboptimal.',
                     'red')
 
         # model is likely poor IF:
         # 1. the train and test return disagree
+        validation_error = 100 * abs(test_return - train_return) / \
+            max(abs(train_return), abs(test_return))
         if not (validation_error < 20):
             return termcolor.colored(
-                '[
+                f'[WARN] Progress was made '
                 f'but relative train-test error {validation_error:.6f} was high: '
-                'poor model relaxation around solution or batch size too small.',
+                f'poor model relaxation around solution or batch size too small.',
                 'yellow')
 
         # model likely did not converge IF:
@@ -2491,24 +2753,22 @@ r"""
         return_to_grad_norm = abs(best_return) / max_grad_norm
         if not (return_to_grad_norm > 1):
             return termcolor.colored(
-                '[
+                f'[WARN] Progress was made '
                 f'but max grad norm {max_grad_norm:.6f} was high: '
-                'solution locally suboptimal '
-                'or
-                'or batch size too small.', 'yellow')
+                f'solution locally suboptimal, relaxed model nonsmooth around solution, '
+                f'or batch size too small.', 'yellow')
 
         # likely successful
         return termcolor.colored(
-            '[
-            '(note: not all
+            '[SUCC] Planner converged successfully '
+            '(note: not all problems can be ruled out).', 'green')
 
     def get_action(self, key: random.PRNGKey,
                    params: Pytree,
                    step: int,
                    subs: Dict[str, Any],
                    policy_hyperparams: Optional[Dict[str, Any]]=None) -> Dict[str, Any]:
-        '''Returns an action dictionary from the policy or plan with the given
-        parameters.
+        '''Returns an action dictionary from the policy or plan with the given parameters.
 
         :param key: the JAX PRNG key
         :param params: the trainable parameter PyTree of the policy
@@ -2612,8 +2872,7 @@ class JaxOfflineController(BaseAgent):
 
 
 class JaxOnlineController(BaseAgent):
-    '''A container class for a Jax controller continuously updated using state
-    feedback.'''
+    '''A container class for a Jax controller continuously updated using state feedback.'''
 
     use_tensor_obs = True
 
|
|
|
2621
2880
|
key: Optional[random.PRNGKey]=None,
|
|
2622
2881
|
eval_hyperparams: Optional[Dict[str, Any]]=None,
|
|
2623
2882
|
warm_start: bool=True,
|
|
2883
|
+
max_attempts: int=3,
|
|
2624
2884
|
**train_kwargs) -> None:
|
|
2625
2885
|
'''Creates a new JAX control policy that is trained online in a closed-
|
|
2626
2886
|
loop fashion.
|
|
2627
2887
|
|
|
2628
2888
|
:param planner: underlying planning algorithm for optimizing actions
|
|
2629
|
-
:param key: the RNG key to seed randomness (derives from clock if not
|
|
2630
|
-
provided)
|
|
2889
|
+
:param key: the RNG key to seed randomness (derives from clock if not provided)
|
|
2631
2890
|
:param eval_hyperparams: policy hyperparameters to apply for evaluation
|
|
2632
2891
|
or whenever sample_action is called
|
|
2633
2892
|
:param warm_start: whether to use the previous decision epoch final
|
|
2634
2893
|
policy parameters to warm the next decision epoch
|
|
2894
|
+
:param max_attempts: maximum attempted restarts of the optimizer when the total
|
|
2895
|
+
iteration count is 1 (i.e. the execution time is dominated by the jit compilation)
|
|
2635
2896
|
:param **train_kwargs: any keyword arguments to be passed to the planner
|
|
2636
2897
|
for optimization
|
|
2637
2898
|
'''
|
|
@@ -2642,16 +2903,26 @@ class JaxOnlineController(BaseAgent):
|
|
|
2642
2903
|
self.eval_hyperparams = eval_hyperparams
|
|
2643
2904
|
self.warm_start = warm_start
|
|
2644
2905
|
self.train_kwargs = train_kwargs
|
|
2906
|
+
self.max_attempts = max_attempts
|
|
2645
2907
|
self.reset()
|
|
2646
2908
|
|
|
2647
2909
|
def sample_action(self, state: Dict[str, Any]) -> Dict[str, Any]:
|
|
2648
2910
|
planner = self.planner
|
|
2649
2911
|
callback = planner.optimize(
|
|
2650
|
-
key=self.key,
|
|
2651
|
-
|
|
2652
|
-
|
|
2653
|
-
|
|
2654
|
-
|
|
2912
|
+
key=self.key, guess=self.guess, subs=state, **self.train_kwargs)
|
|
2913
|
+
|
|
2914
|
+
# optimize again if jit compilation takes up the entire time budget
|
|
2915
|
+
attempts = 0
|
|
2916
|
+
while attempts < self.max_attempts and callback['iteration'] <= 1:
|
|
2917
|
+
attempts += 1
|
|
2918
|
+
message = termcolor.colored(
|
|
2919
|
+
f'[WARN] JIT compilation dominated the execution time: '
|
|
2920
|
+
f'executing the optimizer again on the traced model [attempt {attempts}].',
|
|
2921
|
+
'yellow')
|
|
2922
|
+
print(message)
|
|
2923
|
+
callback = planner.optimize(
|
|
2924
|
+
key=self.key, guess=self.guess, subs=state, **self.train_kwargs)
|
|
2925
|
+
|
|
2655
2926
|
self.callback = callback
|
|
2656
2927
|
params = callback['best_params']
|
|
2657
2928
|
self.key, subkey = random.split(self.key)
|