pyRDDLGym-jax 0.2__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,53 +1,52 @@
- __version__ = '0.2'
-
  from ast import literal_eval
  from collections import deque
  import configparser
  from enum import Enum
+ import os
+ import sys
+ import time
+ import traceback
+ from typing import Any, Callable, Dict, Generator, Optional, Set, Sequence, Tuple, Union
+
  import haiku as hk
  import jax
+ import jax.nn.initializers as initializers
  import jax.numpy as jnp
  import jax.random as random
- import jax.nn.initializers as initializers
  import numpy as np
  import optax
- import os
- import sys
  import termcolor
- import time
  from tqdm import tqdm
- from typing import Any, Callable, Dict, Generator, Optional, Set, Sequence, Tuple, Union
-
- Activation = Callable[[jnp.ndarray], jnp.ndarray]
- Bounds = Dict[str, Tuple[np.ndarray, np.ndarray]]
- Kwargs = Dict[str, Any]
- Pytree = Any
 
- from pyRDDLGym.core.debug.exception import raise_warning
-
- # try to import matplotlib, if failed then skip plotting
- try:
-     import matplotlib
-     import matplotlib.pyplot as plt
-     matplotlib.use('TkAgg')
- except Exception:
-     raise_warning('matplotlib is not installed, '
-                   'plotting functionality is disabled.', 'red')
-     plt = None
-
  from pyRDDLGym.core.compiler.model import RDDLPlanningModel, RDDLLiftedModel
  from pyRDDLGym.core.debug.logger import Logger
  from pyRDDLGym.core.debug.exception import (
+     raise_warning,
      RDDLNotImplementedError,
      RDDLUndefinedVariableError,
      RDDLTypeError
  )
  from pyRDDLGym.core.policy import BaseAgent
 
- from pyRDDLGym_jax.core.compiler import JaxRDDLCompiler
+ from pyRDDLGym_jax import __version__
  from pyRDDLGym_jax.core import logic
+ from pyRDDLGym_jax.core.compiler import JaxRDDLCompiler
  from pyRDDLGym_jax.core.logic import FuzzyLogic
 
+ # try to import matplotlib, if failed then skip plotting
+ try:
+     import matplotlib.pyplot as plt
+ except Exception:
+     raise_warning('failed to import matplotlib: '
+                   'plotting functionality will be disabled.', 'red')
+     traceback.print_exc()
+     plt = None
+
+ Activation = Callable[[jnp.ndarray], jnp.ndarray]
+ Bounds = Dict[str, Tuple[np.ndarray, np.ndarray]]
+ Kwargs = Dict[str, Any]
+ Pytree = Any
+
 
  # ***********************************************************************
  # CONFIG FILE MANAGEMENT
@@ -102,9 +101,12 @@ def _load_config(config, args):
      comp_kwargs = model_args.get('complement_kwargs', {})
      compare_name = model_args.get('comparison', 'SigmoidComparison')
      compare_kwargs = model_args.get('comparison_kwargs', {})
+     sampling_name = model_args.get('sampling', 'GumbelSoftmax')
+     sampling_kwargs = model_args.get('sampling_kwargs', {})
      logic_kwargs['tnorm'] = getattr(logic, tnorm_name)(**tnorm_kwargs)
      logic_kwargs['complement'] = getattr(logic, comp_name)(**comp_kwargs)
      logic_kwargs['comparison'] = getattr(logic, compare_name)(**compare_kwargs)
+     logic_kwargs['sampling'] = getattr(logic, sampling_name)(**sampling_kwargs)
 
      # read the policy settings
      plan_method = planner_args.pop('method')
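The model section of the config now wires in a relaxed sampling scheme (default GumbelSoftmax) alongside the t-norm, complement, and comparison relaxations. A minimal sketch of the equivalent construction in Python, assuming these keyword arguments are forwarded to the FuzzyLogic constructor exactly as _load_config assembles them:

# Illustrative sketch only: apart from SigmoidComparison and GumbelSoftmax,
# which appear in the hunk above, nothing here is asserted about the API.
from pyRDDLGym_jax.core import logic
from pyRDDLGym_jax.core.logic import FuzzyLogic

relaxed_logic = FuzzyLogic(
    comparison=logic.SigmoidComparison(),   # default 'comparison' in the config
    sampling=logic.GumbelSoftmax())         # new in 0.4: relaxed discrete sampling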
@@ -113,7 +115,8 @@ def _load_config(config, args):
      # policy initialization
      plan_initializer = plan_kwargs.get('initializer', None)
      if plan_initializer is not None:
-         initializer = _getattr_any(packages=[initializers], item=plan_initializer)
+         initializer = _getattr_any(
+             packages=[initializers, hk.initializers], item=plan_initializer)
          if initializer is None:
              raise_warning(
                  f'Ignoring invalid initializer <{plan_initializer}>.', 'red')
@@ -130,7 +133,8 @@ def _load_config(config, args):
      # policy activation
      plan_activation = plan_kwargs.get('activation', None)
      if plan_activation is not None:
-         activation = _getattr_any(packages=[jax.nn, jax.numpy], item=plan_activation)
+         activation = _getattr_any(
+             packages=[jax.nn, jax.numpy], item=plan_activation)
          if activation is None:
              raise_warning(
                  f'Ignoring invalid activation <{plan_activation}>.', 'red')
@@ -180,18 +184,6 @@ def load_config_from_string(value: str) -> Tuple[Kwargs, ...]:
  #
  # ***********************************************************************
 
- def _function_discrete_approx_named(logic):
-     jax_discrete, jax_param = logic.discrete()
-
-     def _jax_wrapped_discrete_calc_approx(key, prob, params):
-         sample = jax_discrete(key, prob, params)
-         out_of_bounds = jnp.logical_not(jnp.logical_and(
-             jnp.all(prob >= 0),
-             jnp.allclose(jnp.sum(prob, axis=-1), 1.0)))
-         return sample, out_of_bounds
-
-     return _jax_wrapped_discrete_calc_approx, jax_param
-
 
  class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
      '''Compiles a RDDL AST representation to an equivalent JAX representation.
@@ -217,6 +209,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
          :param *kwargs: keyword arguments to pass to base compiler
          '''
          super(JaxRDDLCompilerWithGrad, self).__init__(*args, **kwargs)
+
          self.logic = logic
          self.logic.set_use64bit(self.use64bit)
          if cpfs_without_grad is None:
@@ -224,9 +217,14 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
          self.cpfs_without_grad = cpfs_without_grad
 
          # actions and CPFs must be continuous
-         raise_warning('Initial values of pvariables will be cast to real.')
+         pvars_cast = set()
          for (var, values) in self.init_values.items():
              self.init_values[var] = np.asarray(values, dtype=self.REAL)
+             if not np.issubdtype(np.atleast_1d(values).dtype, np.floating):
+                 pvars_cast.add(var)
+         if pvars_cast:
+             raise_warning(f'JAX gradient compiler requires that initial values '
+                           f'of p-variables {pvars_cast} be cast to float.')
 
          # overwrite basic operations with fuzzy ones
          self.RELATIONAL_OPS = {
@@ -261,7 +259,9 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
          self.IF_HELPER = logic.control_if()
          self.SWITCH_HELPER = logic.control_switch()
          self.BERNOULLI_HELPER = logic.bernoulli()
-         self.DISCRETE_HELPER = _function_discrete_approx_named(logic)
+         self.DISCRETE_HELPER = logic.discrete()
+         self.POISSON_HELPER = logic.poisson()
+         self.GEOMETRIC_HELPER = logic.geometric()
 
      def _jax_stop_grad(self, jax_expr):
 
@@ -273,20 +273,29 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
          return _jax_wrapped_stop_grad
 
      def _compile_cpfs(self, info):
-         raise_warning('CPFs outputs will be cast to real.')
+         cpfs_cast = set()
          jax_cpfs = {}
          for (_, cpfs) in self.levels.items():
              for cpf in cpfs:
                  _, expr = self.rddl.cpfs[cpf]
                  jax_cpfs[cpf] = self._jax(expr, info, dtype=self.REAL)
+                 if self.rddl.variable_ranges[cpf] != 'real':
+                     cpfs_cast.add(cpf)
                  if cpf in self.cpfs_without_grad:
-                     raise_warning(f'CPF <{cpf}> stops gradient.')
                      jax_cpfs[cpf] = self._jax_stop_grad(jax_cpfs[cpf])
+
+         if cpfs_cast:
+             raise_warning(f'JAX gradient compiler requires that outputs of CPFs '
+                           f'{cpfs_cast} be cast to float.')
+         if self.cpfs_without_grad:
+             raise_warning(f'User requested that gradients not flow '
+                           f'through CPFs {self.cpfs_without_grad}.')
          return jax_cpfs
 
      def _jax_kron(self, expr, info):
          if self.logic.verbose:
-             raise_warning('KronDelta will be ignored.')
+             raise_warning('JAX gradient compiler ignores KronDelta '
+                           'during compilation.')
          arg, = expr.args
          arg = self._jax(arg, info)
          return arg
@@ -308,7 +317,8 @@ class JaxPlan:
          self._train_policy = None
          self._test_policy = None
          self._projection = None
-
+         self.bounds = None
+
      def summarize_hyperparameters(self) -> None:
          pass
 
@@ -363,7 +373,7 @@ class JaxPlan:
              # check invalid type
              if prange not in compiled.JAX_TYPES:
                  raise RDDLTypeError(
-                     f'Invalid range <{prange}. of action-fluent <{name}>, '
+                     f'Invalid range <{prange}> of action-fluent <{name}>, '
                      f'must be one of {set(compiled.JAX_TYPES.keys())}.')
 
              # clip boolean to (0, 1), otherwise use the RDDL action bounds
@@ -385,7 +395,7 @@ class JaxPlan:
                  ~lower_finite & upper_finite,
                  ~lower_finite & ~upper_finite]
              bounds[name] = (lower, upper)
-             raise_warning(f'Bounds of action fluent <{name}> set to {bounds[name]}.')
+             raise_warning(f'Bounds of action-fluent <{name}> set to {bounds[name]}.')
          return shapes, bounds, bounds_safe, cond_lists
 
      def _count_bool_actions(self, rddl: RDDLLiftedModel):
@@ -427,6 +437,7 @@ class JaxStraightLinePlan(JaxPlan):
          use_new_projection = True
          '''
          super(JaxStraightLinePlan, self).__init__()
+
          self._initializer_base = initializer
          self._initializer = initializer
          self._wrap_sigmoid = wrap_sigmoid
@@ -437,15 +448,19 @@ class JaxStraightLinePlan(JaxPlan):
          self._max_constraint_iter = max_constraint_iter
 
      def summarize_hyperparameters(self) -> None:
+         bounds = '\n '.join(
+             map(lambda kv: f'{kv[0]}: {kv[1]}', self.bounds.items()))
          print(f'policy hyper-parameters:\n'
-               f' initializer ={type(self._initializer_base).__name__}\n'
+               f' initializer ={self._initializer_base}\n'
                f'constraint-sat strategy (simple):\n'
+               f' parsed_action_bounds =\n {bounds}\n'
                f' wrap_sigmoid ={self._wrap_sigmoid}\n'
                f' wrap_sigmoid_min_prob={self._min_action_prob}\n'
                f' wrap_non_bool ={self._wrap_non_bool}\n'
                f'constraint-sat strategy (complex):\n'
                f' wrap_softmax ={self._wrap_softmax}\n'
-               f' use_new_projection ={self._use_new_projection}')
+               f' use_new_projection ={self._use_new_projection}\n'
+               f' max_projection_iters ={self._max_constraint_iter}')
 
      def compile(self, compiled: JaxRDDLCompilerWithGrad,
                  _bounds: Bounds,
@@ -603,7 +618,7 @@ class JaxStraightLinePlan(JaxPlan):
          if 1 < allowed_actions < bool_action_count:
              raise RDDLNotImplementedError(
                  f'Straight-line plans with wrap_softmax currently '
-                 f'do not support max-nondef-actions = {allowed_actions} > 1.')
+                 f'do not support max-nondef-actions {allowed_actions} > 1.')
 
          # potentially apply projection but to non-bool actions only
          self.projection = _jax_wrapped_slp_project_to_box
@@ -734,14 +749,14 @@ class JaxStraightLinePlan(JaxPlan):
            for (var, shape) in shapes.items():
                if ranges[var] != 'bool' or not stack_bool_params:
                    key, subkey = random.split(key)
-                   param = init(subkey, shape, dtype=compiled.REAL)
+                   param = init(key=subkey, shape=shape, dtype=compiled.REAL)
                    if ranges[var] == 'bool':
                        param += bool_threshold
                    params[var] = param
            if stack_bool_params:
                key, subkey = random.split(key)
                bool_shape = (horizon, bool_action_count)
-               bool_param = init(subkey, bool_shape, dtype=compiled.REAL)
+               bool_param = init(key=subkey, shape=bool_shape, dtype=compiled.REAL)
                params[bool_key] = bool_param
            params, _ = _jax_wrapped_slp_project_to_box(params, hyperparams)
            return params
@@ -765,7 +780,8 @@ class JaxDeepReactivePolicy(JaxPlan):
      def __init__(self, topology: Optional[Sequence[int]]=None,
                   activation: Activation=jnp.tanh,
                   initializer: hk.initializers.Initializer=hk.initializers.VarianceScaling(scale=2.0),
-                  normalize: bool=True,
+                  normalize: bool=False,
+                  normalize_per_layer: bool=False,
                   normalizer_kwargs: Optional[Kwargs]=None,
                   wrap_non_bool: bool=False) -> None:
          '''Creates a new deep reactive policy in JAX.
@@ -775,12 +791,15 @@ class JaxDeepReactivePolicy(JaxPlan):
          :param activation: function to apply after each layer of the policy
          :param initializer: weight initialization
          :param normalize: whether to apply layer norm to the inputs
+         :param normalize_per_layer: whether to apply layer norm to each input
+         individually (only active if normalize is True)
          :param normalizer_kwargs: if normalize is True, apply additional arguments
          to layer norm
          :param wrap_non_bool: whether to wrap real or int action fluent parameters
          with non-linearity (e.g. sigmoid or ELU) to satisfy box constraints
          '''
          super(JaxDeepReactivePolicy, self).__init__()
+
          if topology is None:
              topology = [128, 64]
          self._topology = topology
@@ -788,22 +807,25 @@ class JaxDeepReactivePolicy(JaxPlan):
          self._initializer_base = initializer
          self._initializer = initializer
          self._normalize = normalize
+         self._normalize_per_layer = normalize_per_layer
          if normalizer_kwargs is None:
-             normalizer_kwargs = {
-                 'create_offset': True, 'create_scale': True,
-                 'name': 'input_norm'
-             }
+             normalizer_kwargs = {'create_offset': True, 'create_scale': True}
          self._normalizer_kwargs = normalizer_kwargs
          self._wrap_non_bool = wrap_non_bool
 
      def summarize_hyperparameters(self) -> None:
+         bounds = '\n '.join(
+             map(lambda kv: f'{kv[0]}: {kv[1]}', self.bounds.items()))
          print(f'policy hyper-parameters:\n'
-               f' topology ={self._topology}\n'
-               f' activation_fn ={self._activations[0].__name__}\n'
-               f' initializer ={type(self._initializer_base).__name__}\n'
-               f' apply_layer_norm={self._normalize}\n'
-               f' layer_norm_args ={self._normalizer_kwargs}\n'
-               f' wrap_non_bool ={self._wrap_non_bool}')
+               f' topology ={self._topology}\n'
+               f' activation_fn ={self._activations[0].__name__}\n'
+               f' initializer ={type(self._initializer_base).__name__}\n'
+               f' apply_input_norm ={self._normalize}\n'
+               f' input_norm_layerwise={self._normalize_per_layer}\n'
+               f' input_norm_args ={self._normalizer_kwargs}\n'
+               f'constraint-sat strategy:\n'
+               f' parsed_action_bounds=\n {bounds}\n'
+               f' wrap_non_bool ={self._wrap_non_bool}')
 
      def compile(self, compiled: JaxRDDLCompilerWithGrad,
                  _bounds: Bounds,
@@ -821,7 +843,7 @@ class JaxDeepReactivePolicy(JaxPlan):
          if 1 < allowed_actions < bool_action_count:
              raise RDDLNotImplementedError(
                  f'Deep reactive policies currently do not support '
-                 f'max-nondef-actions = {allowed_actions} > 1.')
+                 f'max-nondef-actions {allowed_actions} > 1.')
          use_constraint_satisfaction = allowed_actions < bool_action_count
 
          noop = {var: (values[0] if isinstance(values, list) else values)
@@ -835,6 +857,7 @@ class JaxDeepReactivePolicy(JaxPlan):
 
          ranges = rddl.variable_ranges
          normalize = self._normalize
+         normalize_per_layer = self._normalize_per_layer
          wrap_non_bool = self._wrap_non_bool
          init = self._initializer
          layers = list(enumerate(zip(self._topology, self._activations)))
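The deep reactive policy now defaults to no input normalization and adds a per-fluent variant. A hedged construction sketch using only arguments visible in the __init__ signature above:

# Sketch: one layer norm over all non-boolean inputs, without the per-fluent
# variant (argument names follow the signature shown above).
policy = JaxDeepReactivePolicy(
    topology=[64, 32],
    normalize=True,
    normalize_per_layer=False)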
@@ -842,14 +865,67 @@ class JaxDeepReactivePolicy(JaxPlan):
842
865
  for (var, shape) in shapes.items()}
843
866
  layer_names = {var: f'output_{var}'.replace('-', '_') for var in shapes}
844
867
 
845
- # predict actions from the policy network for current state
846
- def _jax_wrapped_policy_network_predict(state):
868
+ # inputs for the policy network
869
+ if rddl.observ_fluents:
870
+ observed_vars = rddl.observ_fluents
871
+ else:
872
+ observed_vars = rddl.state_fluents
873
+ input_names = {var: f'{var}'.replace('-', '_') for var in observed_vars}
874
+
875
+ # catch if input norm is applied to size 1 tensor
876
+ if normalize:
877
+ non_bool_dims = 0
878
+ for (var, values) in observed_vars.items():
879
+ if ranges[var] != 'bool':
880
+ value_size = np.atleast_1d(values).size
881
+ if normalize_per_layer and value_size == 1:
882
+ raise_warning(
883
+ f'Cannot apply layer norm to state-fluent <{var}> '
884
+ f'of size 1: setting normalize_per_layer = False.',
885
+ 'red')
886
+ normalize_per_layer = False
887
+ non_bool_dims += value_size
888
+ if not normalize_per_layer and non_bool_dims == 1:
889
+ raise_warning(
890
+ 'Cannot apply layer norm to state-fluents of total size 1: '
891
+ 'setting normalize = False.', 'red')
892
+ normalize = False
893
+
894
+ # convert subs dictionary into a state vector to feed to the MLP
895
+ def _jax_wrapped_policy_input(subs):
847
896
 
848
- # apply layer norm
849
- if normalize:
897
+ # concatenate all state variables into a single vector
898
+ # optionally apply layer norm to each input tensor
899
+ states_bool, states_non_bool = [], []
900
+ non_bool_dims = 0
901
+ for (var, value) in subs.items():
902
+ if var in observed_vars:
903
+ state = jnp.ravel(value)
904
+ if ranges[var] == 'bool':
905
+ states_bool.append(state)
906
+ else:
907
+ if normalize and normalize_per_layer:
908
+ normalizer = hk.LayerNorm(
909
+ axis=-1, param_axis=-1,
910
+ name=f'input_norm_{input_names[var]}',
911
+ **self._normalizer_kwargs)
912
+ state = normalizer(state)
913
+ states_non_bool.append(state)
914
+ non_bool_dims += state.size
915
+ state = jnp.concatenate(states_non_bool + states_bool)
916
+
917
+ # optionally perform layer normalization on the non-bool inputs
918
+ if normalize and not normalize_per_layer and non_bool_dims:
850
919
  normalizer = hk.LayerNorm(
851
- axis=-1, param_axis=-1, **self._normalizer_kwargs)
852
- state = normalizer(state)
920
+ axis=-1, param_axis=-1, name='input_norm',
921
+ **self._normalizer_kwargs)
922
+ normalized = normalizer(state[:non_bool_dims])
923
+ state = state.at[:non_bool_dims].set(normalized)
924
+ return state
925
+
926
+ # predict actions from the policy network for current state
927
+ def _jax_wrapped_policy_network_predict(subs):
928
+ state = _jax_wrapped_policy_input(subs)
853
929
 
854
930
  # feed state vector through hidden layers
855
931
  hidden = state
@@ -913,25 +989,9 @@ class JaxDeepReactivePolicy(JaxPlan):
913
989
  start += size
914
990
  return actions
915
991
 
916
- if rddl.observ_fluents:
917
- observed_vars = rddl.observ_fluents
918
- else:
919
- observed_vars = rddl.state_fluents
920
-
921
- # state is concatenated into single tensor
922
- def _jax_wrapped_subs_to_state(subs):
923
- subs = {var: value
924
- for (var, value) in subs.items()
925
- if var in observed_vars}
926
- flat_subs = jax.tree_map(jnp.ravel, subs)
927
- states = list(flat_subs.values())
928
- state = jnp.concatenate(states)
929
- return state
930
-
931
992
  # train action prediction
932
993
  def _jax_wrapped_drp_predict_train(key, params, hyperparams, step, subs):
933
- state = _jax_wrapped_subs_to_state(subs)
934
- actions = predict_fn.apply(params, state)
994
+ actions = predict_fn.apply(params, subs)
935
995
  if not wrap_non_bool:
936
996
  for (var, action) in actions.items():
937
997
  if var != bool_key and ranges[var] != 'bool':
@@ -982,8 +1042,7 @@ class JaxDeepReactivePolicy(JaxPlan):
982
1042
  subs = {var: value[0, ...]
983
1043
  for (var, value) in subs.items()
984
1044
  if var in observed_vars}
985
- state = _jax_wrapped_subs_to_state(subs)
986
- params = predict_fn.init(key, state)
1045
+ params = predict_fn.init(key, subs)
987
1046
  return params
988
1047
 
989
1048
  self.initializer = _jax_wrapped_drp_init
@@ -1021,46 +1080,72 @@ class RollingMean:
1021
1080
  class JaxPlannerPlot:
1022
1081
  '''Supports plotting and visualization of a JAX policy in real time.'''
1023
1082
 
1024
- def __init__(self, rddl: RDDLPlanningModel, horizon: int) -> None:
1025
- self._fig, axes = plt.subplots(1 + len(rddl.action_fluents))
1083
+ def __init__(self, rddl: RDDLPlanningModel, horizon: int,
1084
+ show_violin: bool=True, show_action: bool=True) -> None:
1085
+ '''Creates a new planner visualizer.
1086
+
1087
+ :param rddl: the planning model to optimize
1088
+ :param horizon: the lookahead or planning horizon
1089
+ :param show_violin: whether to show the distribution of batch losses
1090
+ :param show_action: whether to show heatmaps of the action fluents
1091
+ '''
1092
+ num_plots = 1
1093
+ if show_violin:
1094
+ num_plots += 1
1095
+ if show_action:
1096
+ num_plots += len(rddl.action_fluents)
1097
+ self._fig, axes = plt.subplots(num_plots)
1098
+ if num_plots == 1:
1099
+ axes = [axes]
1026
1100
 
1027
1101
  # prepare the loss plot
1028
1102
  self._loss_ax = axes[0]
1029
1103
  self._loss_ax.autoscale(enable=True)
1030
- self._loss_ax.set_xlabel('decision epoch')
1104
+ self._loss_ax.set_xlabel('training time')
1031
1105
  self._loss_ax.set_ylabel('loss value')
1032
1106
  self._loss_plot = self._loss_ax.plot(
1033
1107
  [], [], linestyle=':', marker='o', markersize=2)[0]
1034
1108
  self._loss_back = self._fig.canvas.copy_from_bbox(self._loss_ax.bbox)
1035
1109
 
1110
+ # prepare the violin plot
1111
+ if show_violin:
1112
+ self._hist_ax = axes[1]
1113
+ else:
1114
+ self._hist_ax = None
1115
+
1036
1116
  # prepare the action plots
1037
- self._action_ax = {name: axes[idx + 1]
1038
- for (idx, name) in enumerate(rddl.action_fluents)}
1039
- self._action_plots = {}
1040
- for name in rddl.action_fluents:
1041
- ax = self._action_ax[name]
1042
- if rddl.variable_ranges[name] == 'bool':
1043
- vmin, vmax = 0.0, 1.0
1044
- else:
1045
- vmin, vmax = None, None
1046
- action_dim = 1
1047
- for dim in rddl.object_counts(rddl.variable_params[name]):
1048
- action_dim *= dim
1049
- action_plot = ax.pcolormesh(
1050
- np.zeros((action_dim, horizon)),
1051
- cmap='seismic', vmin=vmin, vmax=vmax)
1052
- ax.set_aspect('auto')
1053
- ax.set_xlabel('decision epoch')
1054
- ax.set_ylabel(name)
1055
- plt.colorbar(action_plot, ax=ax)
1056
- self._action_plots[name] = action_plot
1057
- self._action_back = {name: self._fig.canvas.copy_from_bbox(ax.bbox)
1058
- for (name, ax) in self._action_ax.items()}
1059
-
1117
+ if show_action:
1118
+ self._action_ax = {name: axes[idx + (2 if show_violin else 1)]
1119
+ for (idx, name) in enumerate(rddl.action_fluents)}
1120
+ self._action_plots = {}
1121
+ for name in rddl.action_fluents:
1122
+ ax = self._action_ax[name]
1123
+ if rddl.variable_ranges[name] == 'bool':
1124
+ vmin, vmax = 0.0, 1.0
1125
+ else:
1126
+ vmin, vmax = None, None
1127
+ action_dim = 1
1128
+ for dim in rddl.object_counts(rddl.variable_params[name]):
1129
+ action_dim *= dim
1130
+ action_plot = ax.pcolormesh(
1131
+ np.zeros((action_dim, horizon)),
1132
+ cmap='seismic', vmin=vmin, vmax=vmax)
1133
+ ax.set_aspect('auto')
1134
+ ax.set_xlabel('decision epoch')
1135
+ ax.set_ylabel(name)
1136
+ plt.colorbar(action_plot, ax=ax)
1137
+ self._action_plots[name] = action_plot
1138
+ self._action_back = {name: self._fig.canvas.copy_from_bbox(ax.bbox)
1139
+ for (name, ax) in self._action_ax.items()}
1140
+ else:
1141
+ self._action_ax = None
1142
+ self._action_plots = None
1143
+ self._action_back = None
1144
+
1060
1145
  plt.tight_layout()
1061
1146
  plt.show(block=False)
1062
1147
 
1063
- def redraw(self, xticks, losses, actions) -> None:
1148
+ def redraw(self, xticks, losses, actions, returns) -> None:
1064
1149
 
1065
1150
  # draw the loss curve
1066
1151
  self._fig.canvas.restore_region(self._loss_back)
@@ -1071,21 +1156,30 @@ class JaxPlannerPlot:
1071
1156
  self._loss_ax.draw_artist(self._loss_plot)
1072
1157
  self._fig.canvas.blit(self._loss_ax.bbox)
1073
1158
 
1159
+ # draw the violin plot
1160
+ if self._hist_ax is not None:
1161
+ self._hist_ax.clear()
1162
+ self._hist_ax.set_xlabel('loss value')
1163
+ self._hist_ax.set_ylabel('density')
1164
+ self._hist_ax.violinplot(returns, vert=False, showmeans=True)
1165
+
1074
1166
  # draw the actions
1075
- for (name, values) in actions.items():
1076
- values = np.mean(values, axis=0, dtype=float)
1077
- values = np.reshape(values, newshape=(values.shape[0], -1)).T
1078
- self._fig.canvas.restore_region(self._action_back[name])
1079
- self._action_plots[name].set_array(values)
1080
- self._action_ax[name].draw_artist(self._action_plots[name])
1081
- self._fig.canvas.blit(self._action_ax[name].bbox)
1082
- self._action_plots[name].set_clim([np.min(values), np.max(values)])
1167
+ if self._action_ax is not None:
1168
+ for (name, values) in actions.items():
1169
+ values = np.mean(values, axis=0, dtype=float)
1170
+ values = np.reshape(values, newshape=(values.shape[0], -1)).T
1171
+ self._fig.canvas.restore_region(self._action_back[name])
1172
+ self._action_plots[name].set_array(values)
1173
+ self._action_ax[name].draw_artist(self._action_plots[name])
1174
+ self._fig.canvas.blit(self._action_ax[name].bbox)
1175
+ self._action_plots[name].set_clim([np.min(values), np.max(values)])
1176
+
1083
1177
  self._fig.canvas.draw()
1084
1178
  self._fig.canvas.flush_events()
1085
1179
 
1086
1180
  def close(self) -> None:
1087
1181
  plt.close(self._fig)
1088
- del self._loss_ax, self._action_ax, \
1182
+ del self._loss_ax, self._hist_ax, self._action_ax, \
1089
1183
  self._loss_plot, self._action_plots, self._fig, \
1090
1184
  self._loss_back, self._action_back
1091
1185
 
@@ -1099,9 +1193,9 @@ class JaxPlannerStatus(Enum):
      NORMAL = 0
      NO_PROGRESS = 1
      PRECONDITION_POSSIBLY_UNSATISFIED = 2
-     TIME_BUDGET_REACHED = 3
-     ITER_BUDGET_REACHED = 4
-     INVALID_GRADIENT = 5
+     INVALID_GRADIENT = 3
+     TIME_BUDGET_REACHED = 4
+     ITER_BUDGET_REACHED = 5
 
      def is_failure(self) -> bool:
          return self.value >= 3
@@ -1245,30 +1339,41 @@ class JaxBackpropPlanner:
1245
1339
  map(str, jax._src.xla_bridge.devices())).replace('\n', '')
1246
1340
  except Exception as _:
1247
1341
  devices_short = 'N/A'
1342
+ LOGO = \
1343
+ """
1344
+ __ ______ __ __ ______ __ ______ __ __
1345
+ /\ \ /\ __ \ /\_\_\_\ /\ == \/\ \ /\ __ \ /\ "-.\ \
1346
+ _\_\ \ \ \ __ \ \/_/\_\/_ \ \ _-/\ \ \____ \ \ __ \ \ \ \-. \
1347
+ /\_____\ \ \_\ \_\ /\_\/\_\ \ \_\ \ \_____\ \ \_\ \_\ \ \_\\"\_\
1348
+ \/_____/ \/_/\/_/ \/_/\/_/ \/_/ \/_____/ \/_/\/_/ \/_/ \/_/
1349
+ """
1350
+
1248
1351
  print('\n'
1249
- f'JAX Planner version {__version__}\n'
1352
+ f'{LOGO}\n'
1353
+ f'Version {__version__}\n'
1250
1354
  f'Python {sys.version}\n'
1251
1355
  f'jax {jax.version.__version__}, jaxlib {jaxlib_version}, '
1356
+ f'optax {optax.__version__}, haiku {hk.__version__}, '
1252
1357
  f'numpy {np.__version__}\n'
1253
1358
  f'devices: {devices_short}\n')
1254
1359
 
1255
1360
  def summarize_hyperparameters(self) -> None:
1256
1361
  print(f'objective hyper-parameters:\n'
1257
- f' utility_fn ={self.utility.__name__}\n'
1258
- f' utility args ={self.utility_kwargs}\n'
1259
- f' use_symlog ={self.use_symlog_reward}\n'
1260
- f' lookahead ={self.horizon}\n'
1261
- f' action_bounds ={self._action_bounds}\n'
1262
- f' fuzzy logic type={type(self.logic).__name__}\n'
1263
- f' nonfluents exact={self.compile_non_fluent_exact}\n'
1264
- f' cpfs_no_gradient={self.cpfs_without_grad}\n'
1362
+ f' utility_fn ={self.utility.__name__}\n'
1363
+ f' utility args ={self.utility_kwargs}\n'
1364
+ f' use_symlog ={self.use_symlog_reward}\n'
1365
+ f' lookahead ={self.horizon}\n'
1366
+ f' user_action_bounds={self._action_bounds}\n'
1367
+ f' fuzzy logic type ={type(self.logic).__name__}\n'
1368
+ f' nonfluents exact ={self.compile_non_fluent_exact}\n'
1369
+ f' cpfs_no_gradient ={self.cpfs_without_grad}\n'
1265
1370
  f'optimizer hyper-parameters:\n'
1266
- f' use_64_bit ={self.use64bit}\n'
1267
- f' optimizer ={self._optimizer_name.__name__}\n'
1268
- f' optimizer args ={self._optimizer_kwargs}\n'
1269
- f' clip_gradient ={self.clip_grad}\n'
1270
- f' batch_size_train={self.batch_size_train}\n'
1271
- f' batch_size_test ={self.batch_size_test}')
1371
+ f' use_64_bit ={self.use64bit}\n'
1372
+ f' optimizer ={self._optimizer_name.__name__}\n'
1373
+ f' optimizer args ={self._optimizer_kwargs}\n'
1374
+ f' clip_gradient ={self.clip_grad}\n'
1375
+ f' batch_size_train ={self.batch_size_train}\n'
1376
+ f' batch_size_test ={self.batch_size_test}')
1272
1377
  self.plan.summarize_hyperparameters()
1273
1378
  self.logic.summarize_hyperparameters()
1274
1379
 
@@ -1310,6 +1415,7 @@ class JaxBackpropPlanner:
              policy=self.plan.train_policy,
              n_steps=self.horizon,
              n_batch=self.batch_size_train)
+         self.train_rollouts = train_rollouts
 
          test_rollouts = self.test_compiled.compile_rollouts(
              policy=self.plan.test_policy,
@@ -1417,17 +1523,106 @@ class JaxBackpropPlanner:
1417
1523
 
1418
1524
  return init_train, init_test
1419
1525
 
1526
+ def as_optimization_problem(
1527
+ self, key: Optional[random.PRNGKey]=None,
1528
+ policy_hyperparams: Optional[Pytree]=None,
1529
+ loss_function_updates_key: bool=True,
1530
+ grad_function_updates_key: bool=False) -> Tuple[Callable, Callable, np.ndarray, Callable]:
1531
+ '''Returns a function that computes the loss and a function that
1532
+ computes gradient of the return as a 1D vector given a 1D representation
1533
+ of policy parameters. These functions are designed to be compatible with
1534
+ off-the-shelf optimizers such as scipy.
1535
+
1536
+ Also returns the initial parameter vector to seed an optimizer,
1537
+ as well as a mapping that recovers the parameter pytree from the vector.
1538
+ The PRNG key is updated internally starting from the optional given key.
1539
+
1540
+ Constraints on actions, if they are required, cannot be constructed
1541
+ automatically in the general case. The user should build constraints
1542
+ for each problem in the format required by the downstream optimizer.
1543
+
1544
+ :param key: JAX PRNG key (derived from clock if not provided)
1545
+ :param policy_hyperparameters: hyper-parameters for the policy/plan,
1546
+ such as weights for sigmoid wrapping boolean actions (defaults to 1
1547
+ for all action-fluents if not provided)
1548
+ :param loss_function_updates_key: if True, the loss function
1549
+ updates the PRNG key internally independently of the grad function
1550
+ :param grad_function_updates_key: if True, the gradient function
1551
+ updates the PRNG key internally independently of the loss function.
1552
+ '''
1553
+
1554
+ # if PRNG key is not provided
1555
+ if key is None:
1556
+ key = random.PRNGKey(round(time.time() * 1000))
1557
+
1558
+ # initialize the initial fluents, model parameters, policy hyper-params
1559
+ subs = self.test_compiled.init_values
1560
+ train_subs, _ = self._batched_init_subs(subs)
1561
+ model_params = self.compiled.model_params
1562
+ if policy_hyperparams is None:
1563
+ raise_warning('policy_hyperparams is not set, setting 1.0 for '
1564
+ 'all action-fluents which could be suboptimal.')
1565
+ policy_hyperparams = {action: 1.0
1566
+ for action in self.rddl.action_fluents}
1567
+
1568
+ # initialize the policy parameters
1569
+ params_guess, *_ = self.initialize(key, policy_hyperparams, train_subs)
1570
+ guess_1d, unravel_fn = jax.flatten_util.ravel_pytree(params_guess)
1571
+ guess_1d = np.asarray(guess_1d)
1572
+
1573
+ # computes the training loss function and its 1D gradient
1574
+ loss_fn = self._jax_loss(self.train_rollouts)
1575
+
1576
+ @jax.jit
1577
+ def _loss_with_key(key, params_1d):
1578
+ policy_params = unravel_fn(params_1d)
1579
+ loss_val, _ = loss_fn(key, policy_params, policy_hyperparams,
1580
+ train_subs, model_params)
1581
+ return loss_val
1582
+
1583
+ @jax.jit
1584
+ def _grad_with_key(key, params_1d):
1585
+ policy_params = unravel_fn(params_1d)
1586
+ grad_fn = jax.grad(loss_fn, argnums=1, has_aux=True)
1587
+ grad_val, _ = grad_fn(key, policy_params, policy_hyperparams,
1588
+ train_subs, model_params)
1589
+ grad_1d = jax.flatten_util.ravel_pytree(grad_val)[0]
1590
+ return grad_1d
1591
+
1592
+ def _loss_function(params_1d):
1593
+ nonlocal key
1594
+ if loss_function_updates_key:
1595
+ key, subkey = random.split(key)
1596
+ else:
1597
+ subkey = key
1598
+ loss_val = _loss_with_key(subkey, params_1d)
1599
+ loss_val = float(loss_val)
1600
+ return loss_val
1601
+
1602
+ def _grad_function(params_1d):
1603
+ nonlocal key
1604
+ if grad_function_updates_key:
1605
+ key, subkey = random.split(key)
1606
+ else:
1607
+ subkey = key
1608
+ grad = _grad_with_key(subkey, params_1d)
1609
+ grad = np.asarray(grad)
1610
+ return grad
1611
+
1612
+ return _loss_function, _grad_function, guess_1d, jax.jit(unravel_fn)
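The new as_optimization_problem() exposes the training loss and its gradient as flat, NumPy-friendly callables so the planner can be driven by an off-the-shelf optimizer. A minimal usage sketch (the planner instance and the choice of scipy method are illustrative, not part of the package):

# Sketch: the four return values follow the return statement above.
from scipy.optimize import minimize

loss_fn, grad_fn, x0, unravel_fn = planner.as_optimization_problem()
result = minimize(loss_fn, x0, jac=grad_fn, method='L-BFGS-B')
best_policy_params = unravel_fn(result.x)  # back to the policy parameter pytree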
1613
+
1420
1614
  # ===========================================================================
1421
1615
  # OPTIMIZE API
1422
1616
  # ===========================================================================
1423
1617
 
1424
1618
  def optimize(self, *args, **kwargs) -> Dict[str, Any]:
1425
- ''' Compute an optimal policy or plan. Return the callback from training.
1619
+ '''Compute an optimal policy or plan. Return the callback from training.
1426
1620
 
1427
1621
  :param key: JAX PRNG key (derived from clock if not provided)
1428
1622
  :param epochs: the maximum number of steps of gradient descent
1429
1623
  :param train_seconds: total time allocated for gradient descent
1430
1624
  :param plot_step: frequency to plot the plan and save result to disk
1625
+ :param plot_kwargs: additional arguments to pass to the plotter
1431
1626
  :param model_params: optional model-parameters to override default
1432
1627
  :param policy_hyperparams: hyper-parameters for the policy/plan, such as
1433
1628
  weights for sigmoid wrapping boolean actions
@@ -1435,7 +1630,9 @@ class JaxBackpropPlanner:
          their values: if None initializes all variables from the RDDL instance
          :param guess: initial policy parameters: if None will use the initializer
          specified in this instance
-         :param verbose: not print (0), print summary (1), print progress (2)
+         :param print_summary: whether to print planner header, parameter
+         summary, and diagnosis
+         :param print_progress: whether to print the progress bar during training
          :param test_rolling_window: the test return is averaged on a rolling
          window of the past test_rolling_window returns when updating the best
          parameters found so far
@@ -1461,11 +1658,13 @@ class JaxBackpropPlanner:
                   epochs: int=999999,
                   train_seconds: float=120.,
                   plot_step: Optional[int]=None,
+                  plot_kwargs: Optional[Dict[str, Any]]=None,
                   model_params: Optional[Dict[str, Any]]=None,
                   policy_hyperparams: Optional[Dict[str, Any]]=None,
                   subs: Optional[Dict[str, Any]]=None,
                   guess: Optional[Pytree]=None,
-                  verbose: int=2,
+                  print_summary: bool=True,
+                  print_progress: bool=True,
                   test_rolling_window: int=10,
                   tqdm_position: Optional[int]=None) -> Generator[Dict[str, Any], None, None]:
          '''Returns a generator for computing an optimal policy or plan.
@@ -1476,20 +1675,22 @@ class JaxBackpropPlanner:
1476
1675
  :param epochs: the maximum number of steps of gradient descent
1477
1676
  :param train_seconds: total time allocated for gradient descent
1478
1677
  :param plot_step: frequency to plot the plan and save result to disk
1678
+ :param plot_kwargs: additional arguments to pass to the plotter
1479
1679
  :param model_params: optional model-parameters to override default
1480
1680
  :param policy_hyperparams: hyper-parameters for the policy/plan, such as
1481
1681
  weights for sigmoid wrapping boolean actions
1482
1682
  :param subs: dictionary mapping initial state and non-fluents to
1483
1683
  their values: if None initializes all variables from the RDDL instance
1484
1684
  :param guess: initial policy parameters: if None will use the initializer
1485
- specified in this instance
1486
- :param verbose: not print (0), print summary (1), print progress (2)
1685
+ specified in this instance
1686
+ :param print_summary: whether to print planner header, parameter
1687
+ summary, and diagnosis
1688
+ :param print_progress: whether to print the progress bar during training
1487
1689
  :param test_rolling_window: the test return is averaged on a rolling
1488
1690
  window of the past test_rolling_window returns when updating the best
1489
1691
  parameters found so far
1490
1692
  :param tqdm_position: position of tqdm progress bar (for multiprocessing)
1491
1693
  '''
1492
- verbose = int(verbose)
1493
1694
  start_time = time.time()
1494
1695
  elapsed_outside_loop = 0
1495
1696
 
@@ -1511,9 +1712,17 @@ class JaxBackpropPlanner:
1511
1712
  hyperparam_value = float(policy_hyperparams)
1512
1713
  policy_hyperparams = {action: hyperparam_value
1513
1714
  for action in self.rddl.action_fluents}
1715
+
1716
+ # fill in missing entries
1717
+ elif isinstance(policy_hyperparams, dict):
1718
+ for action in self.rddl.action_fluents:
1719
+ if action not in policy_hyperparams:
1720
+ raise_warning(f'policy_hyperparams[{action}] is not set, '
1721
+ 'setting 1.0 which could be suboptimal.')
1722
+ policy_hyperparams[action] = 1.0
1514
1723
 
1515
1724
  # print summary of parameters:
1516
- if verbose >= 1:
1725
+ if print_summary:
1517
1726
  self._summarize_system()
1518
1727
  self.summarize_hyperparameters()
1519
1728
  print(f'optimize() call hyper-parameters:\n'
@@ -1526,8 +1735,10 @@ class JaxBackpropPlanner:
1526
1735
  f' provide_param_guess={guess is not None}\n'
1527
1736
  f' test_rolling_window={test_rolling_window}\n'
1528
1737
  f' plot_frequency ={plot_step}\n'
1529
- f' verbose ={verbose}\n')
1530
- if verbose >= 2 and self.compiled.relaxations:
1738
+ f' plot_kwargs ={plot_kwargs}\n'
1739
+ f' print_summary ={print_summary}\n'
1740
+ f' print_progress ={print_progress}\n')
1741
+ if self.compiled.relaxations:
1531
1742
  print('Some RDDL operations are non-differentiable, '
1532
1743
  'replacing them with differentiable relaxations:')
1533
1744
  print(self.compiled.summarize_model_relaxations())
@@ -1549,7 +1760,7 @@ class JaxBackpropPlanner:
1549
1760
  'from the RDDL files.')
1550
1761
  train_subs, test_subs = self._batched_init_subs(subs)
1551
1762
 
1552
- # initialize, model parameters
1763
+ # initialize model parameters
1553
1764
  if model_params is None:
1554
1765
  model_params = self.compiled.model_params
1555
1766
  model_params_test = self.test_compiled.model_params
@@ -1570,27 +1781,40 @@ class JaxBackpropPlanner:
1570
1781
  rolling_test_loss = RollingMean(test_rolling_window)
1571
1782
  log = {}
1572
1783
  status = JaxPlannerStatus.NORMAL
1784
+ is_all_zero_fn = lambda x: np.allclose(x, 0)
1573
1785
 
1574
1786
  # initialize plot area
1575
1787
  if plot_step is None or plot_step <= 0 or plt is None:
1576
1788
  plot = None
1577
1789
  else:
1578
- plot = JaxPlannerPlot(self.rddl, self.horizon)
1790
+ if plot_kwargs is None:
1791
+ plot_kwargs = {}
1792
+ plot = JaxPlannerPlot(self.rddl, self.horizon, **plot_kwargs)
1579
1793
  xticks, loss_values = [], []
1580
1794
 
1581
1795
  # training loop
1582
1796
  iters = range(epochs)
1583
- if verbose >= 2:
1797
+ if print_progress:
1584
1798
  iters = tqdm(iters, total=100, position=tqdm_position)
1799
+ position_str = '' if tqdm_position is None else f'[{tqdm_position}]'
1585
1800
 
1586
1801
  for it in iters:
1587
1802
  status = JaxPlannerStatus.NORMAL
1588
1803
 
1589
1804
  # update the parameters of the plan
1590
1805
  key, subkey = random.split(key)
1591
- policy_params, converged, opt_state, opt_aux, train_loss, train_log = \
1806
+ policy_params, converged, opt_state, opt_aux, \
1807
+ train_loss, train_log = \
1592
1808
  self.update(subkey, policy_params, policy_hyperparams,
1593
1809
  train_subs, model_params, opt_state, opt_aux)
1810
+
1811
+ # no progress
1812
+ grad_norm_zero, _ = jax.tree_util.tree_flatten(
1813
+ jax.tree_map(is_all_zero_fn, train_log['grad']))
1814
+ if np.all(grad_norm_zero):
1815
+ status = JaxPlannerStatus.NO_PROGRESS
1816
+
1817
+ # constraint satisfaction problem
1594
1818
  if not np.all(converged):
1595
1819
  raise_warning(
1596
1820
  'Projected gradient method for satisfying action concurrency '
@@ -1598,13 +1822,18 @@ class JaxBackpropPlanner:
1598
1822
  'invalid for the current instance.', 'red')
1599
1823
  status = JaxPlannerStatus.PRECONDITION_POSSIBLY_UNSATISFIED
1600
1824
 
1601
- # evaluate losses
1825
+ # numerical error
1826
+ if not np.isfinite(train_loss):
1827
+ raise_warning(
1828
+ f'Aborting JAX planner due to invalid train loss {train_loss}.',
1829
+ 'red')
1830
+ status = JaxPlannerStatus.INVALID_GRADIENT
1831
+
1832
+ # evaluate test losses and record best plan so far
1602
1833
  test_loss, log = self.test_loss(
1603
1834
  subkey, policy_params, policy_hyperparams,
1604
1835
  test_subs, model_params_test)
1605
1836
  test_loss = rolling_test_loss.update(test_loss)
1606
-
1607
- # record the best plan so far
1608
1837
  if test_loss < best_loss:
1609
1838
  best_params, best_loss, best_grad = \
1610
1839
  policy_params, test_loss, train_log['grad']
@@ -1617,15 +1846,17 @@ class JaxBackpropPlanner:
1617
1846
  action_values = {name: values
1618
1847
  for (name, values) in log['fluents'].items()
1619
1848
  if name in self.rddl.action_fluents}
1620
- plot.redraw(xticks, loss_values, action_values)
1849
+ returns = -np.sum(np.asarray(log['reward']), axis=1)
1850
+ plot.redraw(xticks, loss_values, action_values, returns)
1621
1851
 
1622
1852
  # if the progress bar is used
1623
1853
  elapsed = time.time() - start_time - elapsed_outside_loop
1624
- if verbose >= 2:
1854
+ if print_progress:
1625
1855
  iters.n = int(100 * min(1, max(elapsed / train_seconds, it / epochs)))
1626
1856
  iters.set_description(
1627
- f'[{tqdm_position}] {it:6} it / {-train_loss:14.6f} train / '
1628
- f'{-test_loss:14.6f} test / {-best_loss:14.6f} best')
1857
+ f'{position_str} {it:6} it / {-train_loss:14.6f} train / '
1858
+ f'{-test_loss:14.6f} test / {-best_loss:14.6f} best / '
1859
+ f'{status.value} status')
1629
1860
 
1630
1861
  # reached computation budget
1631
1862
  if elapsed >= train_seconds:
@@ -1633,19 +1864,6 @@ class JaxBackpropPlanner:
1633
1864
  if it >= epochs - 1:
1634
1865
  status = JaxPlannerStatus.ITER_BUDGET_REACHED
1635
1866
 
1636
- # numerical error
1637
- if not np.isfinite(train_loss):
1638
- raise_warning(
1639
- f'Aborting JAX planner due to invalid train loss {train_loss}.',
1640
- 'red')
1641
- status = JaxPlannerStatus.INVALID_GRADIENT
1642
-
1643
- # no progress
1644
- grad_norm_zero, _ = jax.tree_util.tree_flatten(
1645
- jax.tree_map(lambda x: np.allclose(x, 0), train_log['grad']))
1646
- if np.all(grad_norm_zero):
1647
- status = JaxPlannerStatus.NO_PROGRESS
1648
-
1649
1867
  # return a callback
1650
1868
  start_time_outside = time.time()
1651
1869
  yield {
@@ -1671,7 +1889,7 @@ class JaxBackpropPlanner:
                  break
 
          # release resources
-         if verbose >= 2:
+         if print_progress:
              iters.close()
          if plot is not None:
              plot.close()
@@ -1688,7 +1906,7 @@ class JaxBackpropPlanner:
                  f'during test evaluation:\n{messages}', 'red')
 
          # summarize and test for convergence
-         if verbose >= 1:
+         if print_summary:
              grad_norm = jax.tree_map(lambda x: np.linalg.norm(x).item(), best_grad)
              diagnosis = self._perform_diagnosis(
                  last_iter_improve, -train_loss, -test_loss, -best_loss, grad_norm)
@@ -1698,7 +1916,7 @@ class JaxBackpropPlanner:
                    f' iterations ={it}\n'
                    f' best_objective={-best_loss}\n'
                    f' best_grad_norm={grad_norm}\n'
-                   f'diagnosis: {diagnosis}\n')
+                   f' diagnosis: {diagnosis}\n')
 
      def _perform_diagnosis(self, last_iter_improve,
                             train_return, test_return, best_return, grad_norm):
@@ -1778,17 +1996,19 @@ class JaxBackpropPlanner:
1778
1996
  raise ValueError(f'State dictionary passed to the JAX policy is '
1779
1997
  f'grounded, since it contains the key <{var}>, '
1780
1998
  f'but a vectorized environment is required: '
1781
- f'please make sure vectorized=True in the RDDLEnv.')
1999
+ f'make sure vectorized = True in the RDDLEnv.')
1782
2000
 
1783
2001
  # must be numeric array
1784
2002
  # exception is for POMDPs at 1st epoch when observ-fluents are None
1785
- if not jnp.issubdtype(values.dtype, jnp.number) \
1786
- and not jnp.issubdtype(values.dtype, jnp.bool_):
2003
+ dtype = np.atleast_1d(values).dtype
2004
+ if not jnp.issubdtype(dtype, jnp.number) \
2005
+ and not jnp.issubdtype(dtype, jnp.bool_):
1787
2006
  if step == 0 and var in self.rddl.observ_fluents:
1788
2007
  subs[var] = self.test_compiled.init_values[var]
1789
2008
  else:
1790
- raise ValueError(f'Values assigned to pvariable {var} are '
1791
- f'non-numeric of type {values.dtype}: {values}.')
2009
+ raise ValueError(
2010
+ f'Values {values} assigned to p-variable <{var}> are '
2011
+ f'non-numeric of type {dtype}.')
1792
2012
 
1793
2013
  # cast device arrays to numpy
1794
2014
  actions = self.test_policy(key, params, policy_hyperparams, step, subs)
@@ -1801,8 +2021,6 @@ class JaxLineSearchPlanner(JaxBackpropPlanner):
      linear search gradient descent, with the Armijo condition.'''
 
      def __init__(self, *args,
-                  optimizer: Callable[..., optax.GradientTransformation]=optax.sgd,
-                  optimizer_kwargs: Kwargs={'learning_rate': 1.0},
                   decay: float=0.8,
                   c: float=0.1,
                   step_max: float=1.0,
@@ -1825,11 +2043,7 @@ class JaxLineSearchPlanner(JaxBackpropPlanner):
              raise_warning('clip_grad parameter conflicts with '
                            'line search planner and will be ignored.', 'red')
              del kwargs['clip_grad']
-         super(JaxLineSearchPlanner, self).__init__(
-             *args,
-             optimizer=optimizer,
-             optimizer_kwargs=optimizer_kwargs,
-             **kwargs)
+         super(JaxLineSearchPlanner, self).__init__(*args, **kwargs)
 
      def summarize_hyperparameters(self) -> None:
          super(JaxLineSearchPlanner, self).summarize_hyperparameters()
@@ -1878,7 +2092,8 @@ class JaxLineSearchPlanner(JaxBackpropPlanner):
              step = lrmax / decay
              f_step = np.inf
              best_f, best_step, best_params, best_state = np.inf, None, None, None
-             while f_step > f - c * step * gnorm2 and step * decay >= lrmin:
+             while (f_step > f - c * step * gnorm2 and step * decay >= lrmin) \
+                     or not trials:
                  trials += 1
                  step *= decay
                  f_step, new_params, new_state = _jax_wrapped_line_search_trial(
@@ -1913,12 +2128,12 @@ class JaxLineSearchPlanner(JaxBackpropPlanner):
  @jax.jit
  def entropic_utility(returns: jnp.ndarray, beta: float) -> float:
      return (-1.0 / beta) * jax.scipy.special.logsumexp(
-         -beta * returns, b=1.0 / returns.size)
+         -beta * returns, b=1.0 / returns.size)
 
 
  @jax.jit
  def mean_variance_utility(returns: jnp.ndarray, beta: float) -> float:
-     return jnp.mean(returns) - (beta / 2.0) * jnp.var(returns)
+     return jnp.mean(returns) - 0.5 * beta * jnp.var(returns)
 
 
  @jax.jit
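These utilities aggregate the batch of sampled returns into the scalar training objective. A hedged sketch of selecting one when constructing a planner, assuming the constructor accepts utility and utility_kwargs as suggested by summarize_hyperparameters earlier in this diff:

# Illustrative only: the constructor keywords are inferred, not asserted.
planner = JaxBackpropPlanner(
    rddl=model,                        # a compiled RDDL model (placeholder)
    plan=JaxStraightLinePlan(),
    utility=mean_variance_utility,     # risk-sensitive objective
    utility_kwargs={'beta': 0.1})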
@@ -1986,7 +2201,8 @@ class JaxOfflineController(BaseAgent):
      def reset(self) -> None:
          self.step = 0
          if self.train_on_reset and not self.params_given:
-             self.params = self.planner.optimize(key=self.key, **self.train_kwargs)
+             callback = self.planner.optimize(key=self.key, **self.train_kwargs)
+             self.params = callback['best_params']
 
 
  class JaxOnlineController(BaseAgent):
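As this last hunk shows, optimize() now returns the training callback rather than raw parameters, so calling code should unpack it. A minimal sketch with placeholder planner and key:

# Sketch: reading the best parameters out of the callback returned by optimize().
callback = planner.optimize(key=jax.random.PRNGKey(42), epochs=1000)
best_params = callback['best_params']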