pyRDDLGym-jax 2.4__py3-none-any.whl → 2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyRDDLGym_jax/__init__.py +1 -1
- pyRDDLGym_jax/core/compiler.py +23 -10
- pyRDDLGym_jax/core/logic.py +6 -8
- pyRDDLGym_jax/core/model.py +595 -0
- pyRDDLGym_jax/core/planner.py +317 -99
- pyRDDLGym_jax/core/simulator.py +37 -13
- pyRDDLGym_jax/core/tuning.py +25 -10
- pyRDDLGym_jax/entry_point.py +39 -7
- pyRDDLGym_jax/examples/configs/tuning_drp.cfg +1 -0
- pyRDDLGym_jax/examples/configs/tuning_replan.cfg +1 -0
- pyRDDLGym_jax/examples/configs/tuning_slp.cfg +1 -0
- pyRDDLGym_jax/examples/run_plan.py +1 -1
- pyRDDLGym_jax/examples/run_tune.py +8 -2
- {pyrddlgym_jax-2.4.dist-info → pyrddlgym_jax-2.6.dist-info}/METADATA +17 -30
- {pyrddlgym_jax-2.4.dist-info → pyrddlgym_jax-2.6.dist-info}/RECORD +19 -18
- {pyrddlgym_jax-2.4.dist-info → pyrddlgym_jax-2.6.dist-info}/WHEEL +1 -1
- {pyrddlgym_jax-2.4.dist-info → pyrddlgym_jax-2.6.dist-info}/entry_points.txt +0 -0
- {pyrddlgym_jax-2.4.dist-info → pyrddlgym_jax-2.6.dist-info/licenses}/LICENSE +0 -0
- {pyrddlgym_jax-2.4.dist-info → pyrddlgym_jax-2.6.dist-info}/top_level.txt +0 -0
pyRDDLGym_jax/core/planner.py
CHANGED
@@ -39,6 +39,7 @@ import configparser
 from enum import Enum
 from functools import partial
 import os
+import pickle
 import sys
 import time
 import traceback
@@ -206,6 +207,13 @@ def _load_config(config, args):
        pgpe_kwargs['optimizer'] = pgpe_optimizer
        planner_args['pgpe'] = getattr(sys.modules[__name__], pgpe_method)(**pgpe_kwargs)

+    # preprocessor settings
+    preproc_method = planner_args.get('preprocessor', None)
+    preproc_kwargs = planner_args.pop('preprocessor_kwargs', {})
+    if preproc_method is not None:
+        planner_args['preprocessor'] = getattr(
+            sys.modules[__name__], preproc_method)(**preproc_kwargs)
+
    # optimize call RNG key
    planner_key = train_args.get('key', None)
    if planner_key is not None:
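As a usage note: `preprocessor` is resolved by class name from this module, mirroring how the `pgpe` option is resolved just above. A minimal sketch of enabling it programmatically (the domain, instance, and topology below are placeholders, not taken from this diff):

import pyRDDLGym
from pyRDDLGym_jax.core.planner import (
    JaxBackpropPlanner, JaxDeepReactivePolicy, StaticNormalizer)

env = pyRDDLGym.make('Wildfire', '0', vectorized=True)   # placeholder domain/instance
planner = JaxBackpropPlanner(
    rddl=env.model,
    plan=JaxDeepReactivePolicy(topology=[64, 64]),
    preprocessor=StaticNormalizer())                      # new keyword in 2.6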
@@ -229,13 +237,19 @@ def _load_config(config, args):


def load_config(path: str) -> Tuple[Kwargs, ...]:
-    '''Loads a config file at the specified file path.
+    '''Loads a config file at the specified file path.
+
+    :param path: the path of the config file to load and parse
+    '''
    config, args = _parse_config_file(path)
    return _load_config(config, args)


def load_config_from_string(value: str) -> Tuple[Kwargs, ...]:
-    '''Loads config file contents specified explicitly as a string value.
+    '''Loads config file contents specified explicitly as a string value.
+
+    :param value: the string in json format containing the config contents to parse
+    '''
    config, args = _parse_config_string(value)
    return _load_config(config, args)

@@ -258,6 +272,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
    def __init__(self, *args,
                 logic: Logic=FuzzyLogic(),
                 cpfs_without_grad: Optional[Set[str]]=None,
+                 print_warnings: bool=True,
                 **kwargs) -> None:
        '''Creates a new RDDL to Jax compiler, where operations that are not
        differentiable are converted to approximate forms that have defined gradients.
@@ -268,6 +283,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
            to customize these operations
        :param cpfs_without_grad: which CPFs do not have gradients (use straight
            through gradient trick)
+        :param print_warnings: whether to print warnings
        :param *kwargs: keyword arguments to pass to base compiler
        '''
        super(JaxRDDLCompilerWithGrad, self).__init__(*args, **kwargs)
@@ -277,6 +293,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
        if cpfs_without_grad is None:
            cpfs_without_grad = set()
        self.cpfs_without_grad = cpfs_without_grad
+        self.print_warnings = print_warnings

        # actions and CPFs must be continuous
        pvars_cast = set()
@@ -284,7 +301,7 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
            self.init_values[var] = np.asarray(values, dtype=self.REAL)
            if not np.issubdtype(np.result_type(values), np.floating):
                pvars_cast.add(var)
-        if pvars_cast:
+        if self.print_warnings and pvars_cast:
            message = termcolor.colored(
                f'[INFO] JAX gradient compiler will cast p-vars {pvars_cast} to float.',
                'green')
@@ -314,12 +331,12 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
            if cpf in self.cpfs_without_grad:
                jax_cpfs[cpf] = self._jax_stop_grad(jax_cpfs[cpf])

-        if cpfs_cast:
+        if self.print_warnings and cpfs_cast:
            message = termcolor.colored(
                f'[INFO] JAX gradient compiler will cast CPFs {cpfs_cast} to float.',
                'green')
            print(message)
-        if self.cpfs_without_grad:
+        if self.print_warnings and self.cpfs_without_grad:
            message = termcolor.colored(
                f'[INFO] Gradients will not flow through CPFs {self.cpfs_without_grad}.',
                'green')
@@ -333,6 +350,100 @@ class JaxRDDLCompilerWithGrad(JaxRDDLCompiler):
        return arg


+# ***********************************************************************
+# ALL VERSIONS OF STATE PREPROCESSING FOR DRP
+#
+# - static normalization
+#
+# ***********************************************************************
+
+
+class Preprocessor(metaclass=ABCMeta):
+    '''Base class for all state preprocessors.'''
+
+    HYPERPARAMS_KEY = 'preprocessor__'
+
+    def __init__(self) -> None:
+        self._initializer = None
+        self._update = None
+        self._transform = None
+
+    @property
+    def initialize(self):
+        return self._initializer
+
+    @property
+    def update(self):
+        return self._update
+
+    @property
+    def transform(self):
+        return self._transform
+
+    @abstractmethod
+    def compile(self, compiled: JaxRDDLCompilerWithGrad) -> None:
+        pass
+
+
+class StaticNormalizer(Preprocessor):
+    '''Normalize values by box constraints on fluents computed from the RDDL domain.'''
+
+    def __init__(self, fluent_bounds: Dict[str, Tuple[np.ndarray, np.ndarray]]={}) -> None:
+        '''Create a new instance of the static normalizer.
+
+        :param fluent_bounds: optional bounds on fluents to overwrite default values.
+        '''
+        self.fluent_bounds = fluent_bounds
+
+    def compile(self, compiled: JaxRDDLCompilerWithGrad) -> None:
+
+        # adjust for partial observability
+        rddl = compiled.rddl
+        if rddl.observ_fluents:
+            observed_vars = rddl.observ_fluents
+        else:
+            observed_vars = rddl.state_fluents
+
+        # ignore boolean fluents and infinite bounds
+        bounded_vars = {}
+        for var in observed_vars:
+            if rddl.variable_ranges[var] != 'bool':
+                lower, upper = compiled.constraints.bounds[var]
+                if np.all(np.isfinite(lower) & np.isfinite(upper) & (lower < upper)):
+                    bounded_vars[var] = (lower, upper)
+                user_bounds = self.fluent_bounds.get(var, None)
+                if user_bounds is not None:
+                    bounded_vars[var] = tuple(user_bounds)
+
+        # initialize to ranges computed by the constraint parser
+        def _jax_wrapped_normalizer_init():
+            return bounded_vars
+        self._initializer = jax.jit(_jax_wrapped_normalizer_init)
+
+        # static bounds
+        def _jax_wrapped_normalizer_update(subs, stats):
+            stats = {var: (jnp.asarray(lower, dtype=compiled.REAL),
+                           jnp.asarray(upper, dtype=compiled.REAL))
+                     for (var, (lower, upper)) in bounded_vars.items()}
+            return stats
+        self._update = jax.jit(_jax_wrapped_normalizer_update)
+
+        # apply min max scaling
+        def _jax_wrapped_normalizer_transform(subs, stats):
+            new_subs = {}
+            for (var, values) in subs.items():
+                if var in stats:
+                    lower, upper = stats[var]
+                    new_dims = jnp.ndim(values) - jnp.ndim(lower)
+                    lower = lower[(jnp.newaxis,) * new_dims + (...,)]
+                    upper = upper[(jnp.newaxis,) * new_dims + (...,)]
+                    new_subs[var] = (values - lower) / (upper - lower)
+                else:
+                    new_subs[var] = values
+            return new_subs
+        self._transform = jax.jit(_jax_wrapped_normalizer_transform)
+
+
 # ***********************************************************************
 # ALL VERSIONS OF JAX PLANS
 #
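For intuition, the `transform` above is ordinary min-max scaling, with the static bounds broadcast across any leading batch axes. A self-contained sketch with illustrative shapes:

import jax.numpy as jnp

lower = jnp.array([0.0, -1.0, 10.0])
upper = jnp.array([5.0, 1.0, 20.0])
values = jnp.ones((32, 3))                            # (batch, fluent dims)

new_dims = jnp.ndim(values) - jnp.ndim(lower)
lower_b = lower[(jnp.newaxis,) * new_dims + (...,)]   # shape (1, 3)
upper_b = upper[(jnp.newaxis,) * new_dims + (...,)]
scaled = (values - lower_b) / (upper_b - lower_b)     # every entry lands in [0, 1]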
@@ -358,7 +469,8 @@ class JaxPlan(metaclass=ABCMeta):
    @abstractmethod
    def compile(self, compiled: JaxRDDLCompilerWithGrad,
                _bounds: Bounds,
-                horizon: int
+                horizon: int,
+                preprocessor: Optional[Preprocessor]=None) -> None:
        pass

    @abstractmethod
@@ -436,10 +548,11 @@ class JaxPlan(metaclass=ABCMeta):
                ~lower_finite & upper_finite,
                ~lower_finite & ~upper_finite]
            bounds[name] = (lower, upper)
-            message = termcolor.colored(
-                f'[INFO] Bounds of action-fluent <{name}> set to {bounds[name]}.',
-                'green')
-            print(message)
+            if compiled.print_warnings:
+                message = termcolor.colored(
+                    f'[INFO] Bounds of action-fluent <{name}> set to {bounds[name]}.',
+                    'green')
+                print(message)
        return shapes, bounds, bounds_safe, cond_lists

    def _count_bool_actions(self, rddl: RDDLLiftedModel):
@@ -508,7 +621,8 @@ class JaxStraightLinePlan(JaxPlan):

    def compile(self, compiled: JaxRDDLCompilerWithGrad,
                _bounds: Bounds,
-                horizon: int
+                horizon: int,
+                preprocessor: Optional[Preprocessor]=None) -> None:
        rddl = compiled.rddl

        # calculate the correct action box bounds
@@ -519,7 +633,7 @@ class JaxStraightLinePlan(JaxPlan):
        # action concurrency check
        bool_action_count, allowed_actions = self._count_bool_actions(rddl)
        use_constraint_satisfaction = allowed_actions < bool_action_count
-        if use_constraint_satisfaction:
+        if compiled.print_warnings and use_constraint_satisfaction:
            message = termcolor.colored(
                f'[INFO] SLP will use projected gradient to satisfy '
                f'max_nondef_actions since total boolean actions '
@@ -596,7 +710,7 @@ class JaxStraightLinePlan(JaxPlan):
            return new_params, True

        # convert softmax action back to action dict
-        action_sizes = {var: np.prod(shape[1:], dtype=
+        action_sizes = {var: np.prod(shape[1:], dtype=np.int64)
                        for (var, shape) in shapes.items()
                        if ranges[var] == 'bool'}
@@ -605,7 +719,7 @@ class JaxStraightLinePlan(JaxPlan):
            start = 0
            for (name, size) in action_sizes.items():
                action = output[..., start:start + size]
-                action = jnp.reshape(action,
+                action = jnp.reshape(action, shapes[name][1:])
                if noop[name]:
                    action = 1.0 - action
                actions[name] = action
@@ -680,7 +794,7 @@ class JaxStraightLinePlan(JaxPlan):
            scores = []
            for (var, param) in params.items():
                if ranges[var] == 'bool':
-                    param_flat = jnp.ravel(param)
+                    param_flat = jnp.ravel(param, order='C')
                    if noop[var]:
                        if wrap_sigmoid:
                            param_flat = -param_flat
@@ -838,7 +952,7 @@ class JaxStraightLinePlan(JaxPlan):

    def guess_next_epoch(self, params: Pytree) -> Pytree:
        next_fn = JaxStraightLinePlan._guess_next_epoch
-        return jax.tree_map(next_fn, params)
+        return jax.tree_util.tree_map(next_fn, params)


class JaxDeepReactivePolicy(JaxPlan):
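The `jax.tree_map` to `jax.tree_util.tree_map` substitution recurs throughout this file: `jax.tree_map` was deprecated and has since been removed from recent JAX releases, while `jax.tree_util.tree_map` keeps identical semantics. A quick check of the replacement call:

import jax

params = {'w': [1.0, 2.0], 'b': 3.0}
doubled = jax.tree_util.tree_map(lambda x: 2 * x, params)
print(doubled)   # {'b': 6.0, 'w': [2.0, 4.0]}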
@@ -897,7 +1011,8 @@ class JaxDeepReactivePolicy(JaxPlan):

    def compile(self, compiled: JaxRDDLCompilerWithGrad,
                _bounds: Bounds,
-                horizon: int
+                horizon: int,
+                preprocessor: Optional[Preprocessor]=None) -> None:
        rddl = compiled.rddl

        # calculate the correct action box bounds
@@ -928,7 +1043,7 @@ class JaxDeepReactivePolicy(JaxPlan):
        wrap_non_bool = self._wrap_non_bool
        init = self._initializer
        layers = list(enumerate(zip(self._topology, self._activations)))
-        layer_sizes = {var: np.prod(shape, dtype=
+        layer_sizes = {var: np.prod(shape, dtype=np.int64)
                       for (var, shape) in shapes.items()}
        layer_names = {var: f'output_{var}'.replace('-', '_') for var in shapes}

@@ -946,21 +1061,28 @@ class JaxDeepReactivePolicy(JaxPlan):
            if ranges[var] != 'bool':
                value_size = np.size(values)
                if normalize_per_layer and value_size == 1:
-                    message = termcolor.colored(
-                        f'[WARN] Cannot apply layer norm to state-fluent <{var}> '
-                        f'of size 1: setting normalize_per_layer = False.', 'yellow')
-                    print(message)
+                    if compiled.print_warnings:
+                        message = termcolor.colored(
+                            f'[WARN] Cannot apply layer norm to state-fluent <{var}> '
+                            f'of size 1: setting normalize_per_layer = False.', 'yellow')
+                        print(message)
                    normalize_per_layer = False
                non_bool_dims += value_size
        if not normalize_per_layer and non_bool_dims == 1:
-            message = termcolor.colored(
-                '[WARN] Cannot apply layer norm to state-fluents of total size 1: '
-                'setting normalize = False.', 'yellow')
-            print(message)
+            if compiled.print_warnings:
+                message = termcolor.colored(
+                    '[WARN] Cannot apply layer norm to state-fluents of total size 1: '
+                    'setting normalize = False.', 'yellow')
+                print(message)
            normalize = False

        # convert subs dictionary into a state vector to feed to the MLP
-        def _jax_wrapped_policy_input(subs):
+        def _jax_wrapped_policy_input(subs, hyperparams):
+
+            # optional state preprocessing
+            if preprocessor is not None:
+                stats = hyperparams[preprocessor.HYPERPARAMS_KEY]
+                subs = preprocessor.transform(subs, stats)

            # concatenate all state variables into a single vector
            # optionally apply layer norm to each input tensor
@@ -968,7 +1090,7 @@ class JaxDeepReactivePolicy(JaxPlan):
            non_bool_dims = 0
            for (var, value) in subs.items():
                if var in observed_vars:
-                    state = jnp.ravel(value)
+                    state = jnp.ravel(value, order='C')
                    if ranges[var] == 'bool':
                        states_bool.append(state)
                    else:
@@ -997,8 +1119,8 @@ class JaxDeepReactivePolicy(JaxPlan):
            return state

        # predict actions from the policy network for current state
-        def _jax_wrapped_policy_network_predict(subs):
-            state = _jax_wrapped_policy_input(subs)
+        def _jax_wrapped_policy_network_predict(subs, hyperparams):
+            state = _jax_wrapped_policy_input(subs, hyperparams)

            # feed state vector through hidden layers
            hidden = state
@@ -1054,7 +1176,7 @@ class JaxDeepReactivePolicy(JaxPlan):
            for (name, size) in layer_sizes.items():
                if ranges[name] == 'bool':
                    action = output[..., start:start + size]
-                    action = jnp.reshape(action,
+                    action = jnp.reshape(action, shapes[name])
                    if noop[name]:
                        action = 1.0 - action
                    actions[name] = action
@@ -1063,7 +1185,7 @@ class JaxDeepReactivePolicy(JaxPlan):

        # train action prediction
        def _jax_wrapped_drp_predict_train(key, params, hyperparams, step, subs):
-            actions = predict_fn.apply(params, subs)
+            actions = predict_fn.apply(params, subs, hyperparams)
            if not wrap_non_bool:
                for (var, action) in actions.items():
                    if var != bool_key and ranges[var] != 'bool':
@@ -1113,7 +1235,7 @@ class JaxDeepReactivePolicy(JaxPlan):
            subs = {var: value[0, ...]
                    for (var, value) in subs.items()
                    if var in observed_vars}
-            params = predict_fn.init(key, subs)
+            params = predict_fn.init(key, subs, hyperparams)
            return params

        self.initializer = _jax_wrapped_drp_init
@@ -1226,6 +1348,7 @@ class PGPE(metaclass=ABCMeta):

    @abstractmethod
    def compile(self, loss_fn: Callable, projection: Callable, real_dtype: Type,
+                print_warnings: bool,
                parallel_updates: Optional[int]=None) -> None:
        pass

@@ -1322,6 +1445,7 @@ class GaussianPGPE(PGPE):
        )

    def compile(self, loss_fn: Callable, projection: Callable, real_dtype: Type,
+                print_warnings: bool,
                parallel_updates: Optional[int]=None) -> None:
        sigma0 = self.init_sigma
        sigma_lo, sigma_hi = self.sigma_range
@@ -1347,7 +1471,7 @@ class GaussianPGPE(PGPE):

        def _jax_wrapped_pgpe_init(key, policy_params):
            mu = policy_params
-            sigma = jax.tree_map(partial(jnp.full_like, fill_value=sigma0), mu)
+            sigma = jax.tree_util.tree_map(partial(jnp.full_like, fill_value=sigma0), mu)
            pgpe_params = (mu, sigma)
            pgpe_opt_state = (mu_optimizer.init(mu), sigma_optimizer.init(sigma))
            r_max = -jnp.inf
@@ -1395,13 +1519,14 @@ class GaussianPGPE(PGPE):
            treedef = jax.tree_util.tree_structure(sigma)
            keys = random.split(key, num=treedef.num_leaves)
            keys_pytree = jax.tree_util.tree_unflatten(treedef=treedef, leaves=keys)
-            epsilon = jax.tree_map(_jax_wrapped_mu_noise, keys_pytree, sigma)
-            p1 = jax.tree_map(jnp.add, mu, epsilon)
-            p2 = jax.tree_map(jnp.subtract, mu, epsilon)
+            epsilon = jax.tree_util.tree_map(_jax_wrapped_mu_noise, keys_pytree, sigma)
+            p1 = jax.tree_util.tree_map(jnp.add, mu, epsilon)
+            p2 = jax.tree_util.tree_map(jnp.subtract, mu, epsilon)
            if super_symmetric:
-                epsilon_star = jax.tree_map(
-
-
+                epsilon_star = jax.tree_util.tree_map(
+                    _jax_wrapped_epsilon_star, sigma, epsilon)
+                p3 = jax.tree_util.tree_map(jnp.add, mu, epsilon_star)
+                p4 = jax.tree_util.tree_map(jnp.subtract, mu, epsilon_star)
            else:
                epsilon_star, p3, p4 = epsilon, p1, p2
            return p1, p2, p3, p4, epsilon, epsilon_star
@@ -1469,11 +1594,11 @@ class GaussianPGPE(PGPE):
                    r_max = jnp.maximum(r_max, r4)
                else:
                    r3, r4 = r1, r2
-                grad_mu = jax.tree_map(
+                grad_mu = jax.tree_util.tree_map(
                    partial(_jax_wrapped_mu_grad, r1=r1, r2=r2, r3=r3, r4=r4, m=r_max),
                    epsilon, epsilon_star
                )
-                grad_sigma = jax.tree_map(
+                grad_sigma = jax.tree_util.tree_map(
                    partial(_jax_wrapped_sigma_grad,
                            r1=r1, r2=r2, r3=r3, r4=r4, m=r_max, ent=ent),
                    epsilon, epsilon_star, sigma
@@ -1492,7 +1617,7 @@ class GaussianPGPE(PGPE):
                _jax_wrapped_pgpe_grad,
                in_axes=(0, None, None, None, None, None, None, None)
            )(keys, mu, sigma, r_max, ent, policy_hyperparams, subs, model_params)
-            mu_grad, sigma_grad = jax.tree_map(
+            mu_grad, sigma_grad = jax.tree_util.tree_map(
                partial(jnp.mean, axis=0), (mu_grads, sigma_grads))
            new_r_max = jnp.max(r_maxs)
            return mu_grad, sigma_grad, new_r_max
@@ -1516,7 +1641,7 @@ class GaussianPGPE(PGPE):
                sigma_grad, sigma_state, params=sigma)
            new_mu = optax.apply_updates(mu, mu_updates)
            new_sigma = optax.apply_updates(sigma, sigma_updates)
-            new_sigma = jax.tree_map(
+            new_sigma = jax.tree_util.tree_map(
                partial(jnp.clip, min=sigma_lo, max=sigma_hi), new_sigma)
            return new_mu, new_sigma, new_mu_state, new_sigma_state

@@ -1537,7 +1662,7 @@ class GaussianPGPE(PGPE):
            if max_kl is not None:
                old_mu_lr = new_mu_state.hyperparams['learning_rate']
                old_sigma_lr = new_sigma_state.hyperparams['learning_rate']
-                kl_terms = jax.tree_map(
+                kl_terms = jax.tree_util.tree_map(
                    _jax_wrapped_pgpe_kl_term, new_mu, new_sigma, mu, sigma)
                total_kl = jax.tree_util.tree_reduce(jnp.add, kl_terms)
                kl_reduction = jnp.minimum(1.0, jnp.sqrt(max_kl / total_kl))
@@ -1618,12 +1743,21 @@ def mean_semivariance_utility(returns: jnp.ndarray, beta: float) -> float:
    return mu - 0.5 * beta * msv


+@jax.jit
+def sharpe_utility(returns: jnp.ndarray, risk_free: float) -> float:
+    return (jnp.mean(returns) - risk_free) / (jnp.std(returns) + 1e-10)
+
+
+@jax.jit
+def var_utility(returns: jnp.ndarray, alpha: float) -> float:
+    return jnp.percentile(returns, q=100 * alpha)
+
+
@jax.jit
def cvar_utility(returns: jnp.ndarray, alpha: float) -> float:
    var = jnp.percentile(returns, q=100 * alpha)
    mask = returns <= var
-
-    return jnp.sum(returns * weights)
+    return jnp.sum(returns * mask) / jnp.maximum(1, jnp.sum(mask))


# set of all currently valid built-in utility functions
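The new `sharpe_utility` and `var_utility` join the reworked `cvar_utility`. A quick numeric check of what each computes, with made-up returns:

import jax.numpy as jnp

returns = jnp.array([-2.0, 0.5, 1.0, 3.0])

sharpe = (jnp.mean(returns) - 0.0) / (jnp.std(returns) + 1e-10)    # risk_free=0, approx 0.35
var50 = jnp.percentile(returns, q=100 * 0.5)                       # alpha=0.5, gives 0.75
mask = returns <= var50
cvar50 = jnp.sum(returns * mask) / jnp.maximum(1, jnp.sum(mask))   # mean of worst half: -0.75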
@@ -1633,8 +1767,10 @@ UTILITY_LOOKUP = {
    'mean_std': mean_deviation_utility,
    'mean_semivar': mean_semivariance_utility,
    'mean_semidev': mean_semideviation_utility,
+    'sharpe': sharpe_utility,
    'entropic': entropic_utility,
    'exponential': entropic_utility,
+    'var': var_utility,
    'cvar': cvar_utility
}

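These names are what the planner's `utility` option accepts. A hedged sketch of selecting one of the new utilities (constructor arguments other than the utility settings are elided):

planner = JaxBackpropPlanner(
    rddl=env.model,
    plan=JaxStraightLinePlan(),
    utility='var',                    # new in 2.6, alongside 'sharpe'
    utility_kwargs={'alpha': 0.05})   # matches var_utility(returns, alpha)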
@@ -1672,7 +1808,9 @@ class JaxBackpropPlanner:
                 compile_non_fluent_exact: bool=True,
                 logger: Optional[Logger]=None,
                 dashboard_viz: Optional[Any]=None,
-                 parallel_updates: Optional[int]=None) -> None:
+                 print_warnings: bool=True,
+                 parallel_updates: Optional[int]=None,
+                 preprocessor: Optional[Preprocessor]=None) -> None:
        '''Creates a new gradient-based algorithm for optimizing action sequences
        (plan) in the given RDDL. Some operations will be converted to their
        differentiable counterparts; the specific operations can be customized
@@ -1712,7 +1850,9 @@ class JaxBackpropPlanner:
        :param logger: to log information about compilation to file
        :param dashboard_viz: optional visualizer object from the environment
            to pass to the dashboard to visualize the policy
+        :param print_warnings: whether to print warnings
        :param parallel_updates: how many optimizers to run independently in parallel
+        :param preprocessor: optional preprocessor for state inputs to plan
        '''
        self.rddl = rddl
        self.plan = plan
@@ -1737,6 +1877,8 @@ class JaxBackpropPlanner:
        self.noise_kwargs = noise_kwargs
        self.pgpe = pgpe
        self.use_pgpe = pgpe is not None
+        self.print_warnings = print_warnings
+        self.preprocessor = preprocessor

        # set optimizer
        try:
@@ -1789,7 +1931,11 @@ class JaxBackpropPlanner:
        self._jax_compile_rddl()
        self._jax_compile_optimizer()

-    def summarize_system(self) -> str:
+    @staticmethod
+    def summarize_system() -> str:
+        '''Returns a string containing information about the system, Python version
+        and jax-related packages that are relevant to the current planner.
+        '''
        try:
            jaxlib_version = jax._src.lib.version_str
        except Exception as _:
@@ -1818,6 +1964,9 @@ r"""
                f'devices: {devices_short}\n')

    def summarize_relaxations(self) -> str:
+        '''Returns a summary table containing all non-differentiable operators
+        and their relaxations.
+        '''
        result = ''
        if self.compiled.model_params:
            result += ('Some RDDL operations are non-differentiable '
@@ -1834,6 +1983,9 @@ r"""
        return result

    def summarize_hyperparameters(self) -> str:
+        '''Returns a string summarizing the hyper-parameters of the current planner
+        instance.
+        '''
        result = (f'objective hyper-parameters:\n'
                  f'    utility_fn       ={self.utility.__name__}\n'
                  f'    utility args     ={self.utility_kwargs}\n'
@@ -1852,7 +2004,8 @@ r"""
                  f'    noise_kwargs     ={self.noise_kwargs}\n'
                  f'    batch_size_train ={self.batch_size_train}\n'
                  f'    batch_size_test  ={self.batch_size_test}\n'
-                  f'    parallel_updates ={self.parallel_updates}\n')
+                  f'    parallel_updates ={self.parallel_updates}\n'
+                  f'    preprocessor     ={self.preprocessor}\n')
        result += str(self.plan)
        if self.use_pgpe:
            result += str(self.pgpe)
@@ -1873,7 +2026,8 @@ r"""
            logger=self.logger,
            use64bit=self.use64bit,
            cpfs_without_grad=self.cpfs_without_grad,
-            compile_non_fluent_exact=self.compile_non_fluent_exact
+            compile_non_fluent_exact=self.compile_non_fluent_exact,
+            print_warnings=self.print_warnings
        )
        self.compiled.compile(log_jax_expr=True, heading='RELAXED MODEL')

@@ -1887,10 +2041,15 @@ r"""

    def _jax_compile_optimizer(self):

+        # preprocessor
+        if self.preprocessor is not None:
+            self.preprocessor.compile(self.compiled)
+
        # policy
        self.plan.compile(self.compiled,
                          _bounds=self._action_bounds,
-                          horizon=self.horizon
+                          horizon=self.horizon,
+                          preprocessor=self.preprocessor)
        self.train_policy = jax.jit(self.plan.train_policy)
        self.test_policy = jax.jit(self.plan.test_policy)

@@ -1898,14 +2057,16 @@ r"""
        train_rollouts = self.compiled.compile_rollouts(
            policy=self.plan.train_policy,
            n_steps=self.horizon,
-            n_batch=self.batch_size_train
+            n_batch=self.batch_size_train,
+            cache_path_info=self.preprocessor is not None
        )
        self.train_rollouts = train_rollouts

        test_rollouts = self.test_compiled.compile_rollouts(
            policy=self.plan.test_policy,
            n_steps=self.horizon,
-            n_batch=self.batch_size_test
+            n_batch=self.batch_size_test,
+            cache_path_info=False
        )
        self.test_rollouts = jax.jit(test_rollouts)

@@ -1922,7 +2083,8 @@ r"""

        # optimization
        self.update = self._jax_update(train_loss)
-        self.pytree_at = jax.jit(
+        self.pytree_at = jax.jit(
+            lambda tree, i: jax.tree_util.tree_map(lambda x: x[i], tree))

        # pgpe option
        if self.use_pgpe:
@@ -1930,6 +2092,7 @@ r"""
                loss_fn=test_loss,
                projection=self.plan.projection,
                real_dtype=self.test_compiled.REAL,
+                print_warnings=self.print_warnings,
                parallel_updates=self.parallel_updates
            )
            self.merge_pgpe = self._jax_merge_pgpe_jaxplan()
@@ -2010,7 +2173,7 @@ r"""
        # check if the gradients are all zeros
        def _jax_wrapped_zero_gradients(grad):
            leaves, _ = jax.tree_util.tree_flatten(
-                jax.tree_map(partial(jnp.allclose, b=0), grad))
+                jax.tree_util.tree_map(partial(jnp.allclose, b=0), grad))
            return jnp.all(jnp.asarray(leaves))

        # calculate the plan gradient w.r.t. return loss and update optimizer
@@ -2069,7 +2232,7 @@ r"""
            def select_fn(leaf1, leaf2):
                expanded_mask = pgpe_mask[(...,) + (jnp.newaxis,) * (jnp.ndim(leaf1) - 1)]
                return jnp.where(expanded_mask, leaf1, leaf2)
-            policy_params = jax.tree_map(select_fn, pgpe_param, policy_params)
+            policy_params = jax.tree_util.tree_map(select_fn, pgpe_param, policy_params)
            test_loss = jnp.where(pgpe_mask, pgpe_loss, test_loss)
            test_loss_smooth = jnp.where(pgpe_mask, pgpe_loss_smooth, test_loss_smooth)
            expanded_mask = pgpe_mask[(...,) + (jnp.newaxis,) * (jnp.ndim(converged) - 1)]
@@ -2091,7 +2254,9 @@ r"""
                    f'Variable <{name}> in subs argument is not a '
                    f'valid p-variable, must be one of '
                    f'{set(self.test_compiled.init_values.keys())}.')
-            value = np.reshape(value,
+            value = np.reshape(value, np.shape(init_value))[np.newaxis, ...]
+            if value.dtype.type is np.str_:
+                value = rddl.object_string_to_index_array(rddl.variable_ranges[name], value)
            train_value = np.repeat(value, repeats=n_train, axis=0)
            train_value = np.asarray(train_value, dtype=self.compiled.REAL)
            init_train[name] = train_value
@@ -2121,7 +2286,7 @@ r"""
                    x[np.newaxis, ...], shape=(self.parallel_updates,) + np.shape(x))
            return x

-        return jax.tree_map(make_batched, pytree)
+        return jax.tree_util.tree_map(make_batched, pytree)

    def as_optimization_problem(
            self, key: Optional[random.PRNGKey]=None,
@@ -2165,10 +2330,11 @@ r"""
        train_subs, _ = self._batched_init_subs(subs)
        model_params = self.compiled.model_params
        if policy_hyperparams is None:
-            message = termcolor.colored(
-                '[WARN] policy_hyperparams is not set, setting 1.0 for '
-                'all action-fluents which could be suboptimal.', 'yellow')
-            print(message)
+            if self.print_warnings:
+                message = termcolor.colored(
+                    '[WARN] policy_hyperparams is not set, setting 1.0 for '
+                    'all action-fluents which could be suboptimal.', 'yellow')
+                print(message)
            policy_hyperparams = {action: 1.0
                                  for action in self.rddl.action_fluents}

@@ -2318,10 +2484,11 @@ r"""

        # cannot run dashboard with parallel updates
        if dashboard is not None and self.parallel_updates is not None:
-            message = termcolor.colored(
-                '[WARN] Dashboard is unavailable if parallel_updates is not None: '
-                'setting dashboard to None.', 'yellow')
-            print(message)
+            if self.print_warnings:
+                message = termcolor.colored(
+                    '[WARN] Dashboard is unavailable if parallel_updates is not None: '
+                    'setting dashboard to None.', 'yellow')
+                print(message)
            dashboard = None

        # if PRNG key is not provided
@@ -2331,19 +2498,21 @@ r"""

        # if policy_hyperparams is not provided
        if policy_hyperparams is None:
-            message = termcolor.colored(
-                '[WARN] policy_hyperparams is not set, setting 1.0 for '
-                'all action-fluents which could be suboptimal.', 'yellow')
-            print(message)
+            if self.print_warnings:
+                message = termcolor.colored(
+                    '[WARN] policy_hyperparams is not set, setting 1.0 for '
+                    'all action-fluents which could be suboptimal.', 'yellow')
+                print(message)
            policy_hyperparams = {action: 1.0
                                  for action in self.rddl.action_fluents}

        # if policy_hyperparams is a scalar
        elif isinstance(policy_hyperparams, (int, float, np.number)):
-            message = termcolor.colored(
-                f'[INFO] policy_hyperparams is {policy_hyperparams}, '
-                f'setting this value for all action-fluents.', 'green')
-            print(message)
+            if self.print_warnings:
+                message = termcolor.colored(
+                    f'[INFO] policy_hyperparams is {policy_hyperparams}, '
+                    f'setting this value for all action-fluents.', 'green')
+                print(message)
            hyperparam_value = float(policy_hyperparams)
            policy_hyperparams = {action: hyperparam_value
                                  for action in self.rddl.action_fluents}
@@ -2352,13 +2521,20 @@ r"""
        elif isinstance(policy_hyperparams, dict):
            for action in self.rddl.action_fluents:
                if action not in policy_hyperparams:
-                    message = termcolor.colored(
-                        f'[WARN] policy_hyperparams[{action}] is not set, '
-                        f'setting 1.0 for missing action-fluents '
-                        f'which could be suboptimal.', 'yellow')
-                    print(message)
+                    if self.print_warnings:
+                        message = termcolor.colored(
+                            f'[WARN] policy_hyperparams[{action}] is not set, '
+                            f'setting 1.0 for missing action-fluents '
+                            f'which could be suboptimal.', 'yellow')
+                        print(message)
                    policy_hyperparams[action] = 1.0
-
+
+        # initialize preprocessor
+        preproc_key = None
+        if self.preprocessor is not None:
+            preproc_key = self.preprocessor.HYPERPARAMS_KEY
+            policy_hyperparams[preproc_key] = self.preprocessor.initialize()
+
        # print summary of parameters:
        if print_summary:
            print(self.summarize_system())
@@ -2396,7 +2572,7 @@ r"""
            if var not in subs:
                subs[var] = value
                added_pvars_to_subs.append(var)
-        if added_pvars_to_subs:
+        if self.print_warnings and added_pvars_to_subs:
            message = termcolor.colored(
                f'[INFO] p-variables {added_pvars_to_subs} is not in '
                f'provided subs, using their initial values.', 'green')
@@ -2485,6 +2661,11 @@ r"""
                subkey, policy_params, policy_hyperparams, train_subs, model_params,
                opt_state, opt_aux)

+            # update the preprocessor
+            if self.preprocessor is not None:
+                policy_hyperparams[preproc_key] = self.preprocessor.update(
+                    train_log['fluents'], policy_hyperparams[preproc_key])
+
            # evaluate
            test_loss, (test_log, model_params_test) = self.test_loss(
                subkey, policy_params, policy_hyperparams, test_subs, model_params_test)
@@ -2637,6 +2818,7 @@ r"""
                'model_params': model_params,
                'progress': progress_percent,
                'train_log': train_log,
+                'policy_hyperparams': policy_hyperparams,
                **test_log
            }

@@ -2648,7 +2830,7 @@ r"""
                policy_params, opt_state, opt_aux = self.initialize(
                    subkey, policy_hyperparams, train_subs)
                no_progress_count = 0
-                if progress_bar is not None:
+                if self.print_warnings and progress_bar is not None:
                    message = termcolor.colored(
                        f'[INFO] Optimizer restarted at iteration {it} '
                        f'due to lack of progress.', 'green')
@@ -2658,7 +2840,7 @@ r"""

            # stopping condition reached
            if stopping_rule is not None and stopping_rule.monitor(callback):
-                if progress_bar is not None:
+                if self.print_warnings and progress_bar is not None:
                    message = termcolor.colored(
                        '[SUCC] Stopping rule has been reached.', 'green')
                    progress_bar.write(message)
@@ -2699,7 +2881,8 @@ r"""

        # summarize and test for convergence
        if print_summary:
-            grad_norm = jax.tree_map(
+            grad_norm = jax.tree_util.tree_map(
+                lambda x: np.linalg.norm(x).item(), best_grad)
            diagnosis = self._perform_diagnosis(
                last_iter_improve, -np.min(train_loss), -np.min(test_loss_smooth),
                -best_loss, grad_norm)
@@ -2713,7 +2896,8 @@ r"""

    def _perform_diagnosis(self, last_iter_improve,
                           train_return, test_return, best_return, grad_norm):
-
+        grad_norms = jax.tree_util.tree_leaves(grad_norm)
+        max_grad_norm = max(grad_norms) if grad_norms else np.nan
        grad_is_zero = np.allclose(max_grad_norm, 0)

        # divergence if the solution is not finite
@@ -2777,6 +2961,7 @@ r"""
        :param policy_hyperparams: hyper-parameters for the policy/plan, such as
            weights for sigmoid wrapping boolean actions (optional)
        '''
+        subs = subs.copy()

        # check compatibility of the subs dictionary
        for (var, values) in subs.items():
@@ -2795,13 +2980,17 @@ r"""
            if step == 0 and var in self.rddl.observ_fluents:
                subs[var] = self.test_compiled.init_values[var]
            else:
-                raise ValueError(
-                    f'Values {values} assigned to p-variable <{var}> are '
-                    f'non-numeric of type {dtype}.')
+                if dtype.type is np.str_:
+                    prange = self.rddl.variable_ranges[var]
+                    subs[var] = self.rddl.object_string_to_index_array(prange, subs[var])
+                else:
+                    raise ValueError(
+                        f'Values {values} assigned to p-variable <{var}> are '
+                        f'non-numeric of type {dtype}.')

        # cast device arrays to numpy
        actions = self.test_policy(key, params, policy_hyperparams, step, subs)
-        actions = jax.tree_map(np.asarray, actions)
+        actions = jax.tree_util.tree_map(np.asarray, actions)
        return actions

@@ -2822,8 +3011,9 @@ class JaxOfflineController(BaseAgent):
    def __init__(self, planner: JaxBackpropPlanner,
                 key: Optional[random.PRNGKey]=None,
                 eval_hyperparams: Optional[Dict[str, Any]]=None,
-                 params: Optional[Pytree]=None,
+                 params: Optional[Union[str, Pytree]]=None,
                 train_on_reset: bool=False,
+                 save_path: Optional[str]=None,
                 **train_kwargs) -> None:
        '''Creates a new JAX offline control policy that is trained once, then
        deployed later.
@@ -2834,8 +3024,10 @@ class JaxOfflineController(BaseAgent):
        :param eval_hyperparams: policy hyperparameters to apply for evaluation
            or whenever sample_action is called
        :param params: use the specified policy parameters instead of calling
-            planner.optimize()
+            planner.optimize(); can be a string pointing to a valid file path where params
+            have been saved, or a pytree of parameters
        :param train_on_reset: retrain policy parameters on every episode reset
+        :param save_path: optional path to save parameters to
        :param **train_kwargs: any keyword arguments to be passed to the planner
            for optimization
        '''
@@ -2847,13 +3039,28 @@ class JaxOfflineController(BaseAgent):
        self.train_on_reset = train_on_reset
        self.train_kwargs = train_kwargs
        self.params_given = params is not None
+        self.hyperparams_given = eval_hyperparams is not None

+        # load the policy from file
+        if not self.train_on_reset and params is not None and isinstance(params, str):
+            with open(params, 'rb') as file:
+                params = pickle.load(file)
+
+        # train the policy
        self.step = 0
        self.callback = None
        if not self.train_on_reset and not self.params_given:
            callback = self.planner.optimize(key=self.key, **self.train_kwargs)
            self.callback = callback
            params = callback['best_params']
+            if not self.hyperparams_given:
+                self.eval_hyperparams = callback['policy_hyperparams']
+
+        # save the policy
+        if save_path is not None:
+            with open(save_path, 'wb') as file:
+                pickle.dump(params, file)
+
        self.params = params

    def sample_action(self, state: Dict[str, Any]) -> Dict[str, Any]:
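The new string-valued `params` and `save_path` arguments give a simple pickle round trip for trained policies. A sketch (the file name and planner are placeholders):

from pyRDDLGym_jax.core.planner import JaxOfflineController

# first run: trains via planner.optimize() and pickles best_params to disk
agent = JaxOfflineController(planner, save_path='policy.pkl')

# later run: skips training by unpickling the saved parameters
agent = JaxOfflineController(planner, params='policy.pkl')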
@@ -2865,10 +3072,14 @@ class JaxOfflineController(BaseAgent):

    def reset(self) -> None:
        self.step = 0
+
+        # train the policy if required to reset at the start of every episode
        if self.train_on_reset and not self.params_given:
            callback = self.planner.optimize(key=self.key, **self.train_kwargs)
            self.callback = callback
            self.params = callback['best_params']
+            if not self.hyperparams_given:
+                self.eval_hyperparams = callback['policy_hyperparams']


class JaxOnlineController(BaseAgent):
@@ -2901,6 +3112,7 @@ class JaxOnlineController(BaseAgent):
            key = random.PRNGKey(round(time.time() * 1000))
        self.key = key
        self.eval_hyperparams = eval_hyperparams
+        self.hyperparams_given = eval_hyperparams is not None
        self.warm_start = warm_start
        self.train_kwargs = train_kwargs
        self.max_attempts = max_attempts
@@ -2915,18 +3127,24 @@ class JaxOnlineController(BaseAgent):
        attempts = 0
        while attempts < self.max_attempts and callback['iteration'] <= 1:
            attempts += 1
-            message = termcolor.colored(
-                f'[WARN] JIT compilation dominated the execution time: '
-                f'executing the optimizer again on the traced model '
-                f'[attempt {attempts}].', 'yellow')
-            print(message)
+            if self.planner.print_warnings:
+                message = termcolor.colored(
+                    f'[WARN] JIT compilation dominated the execution time: '
+                    f'executing the optimizer again on the traced model '
+                    f'[attempt {attempts}].', 'yellow')
+                print(message)
            callback = planner.optimize(
-                key=self.key, guess=self.guess, subs=state, **self.train_kwargs)
-
+                key=self.key, guess=self.guess, subs=state, **self.train_kwargs)
        self.callback = callback
        params = callback['best_params']
+        if not self.hyperparams_given:
+            self.eval_hyperparams = callback['policy_hyperparams']
+
+        # get the action from the parameters for the current state
        self.key, subkey = random.split(self.key)
        actions = planner.get_action(subkey, params, 0, state, self.eval_hyperparams)
+
+        # apply warm start for the next epoch
        if self.warm_start:
            self.guess = planner.plan.guess_next_epoch(params)
        return actions
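Taken together, `sample_action` now retries past JIT-dominated first runs, inherits `policy_hyperparams` from the optimizer callback, and warm-starts the next decision epoch. A sketch of the replanning loop it is built for (domain, horizon, and budget are placeholders; `evaluate` is assumed to be the pyRDDLGym `BaseAgent` helper):

import pyRDDLGym
from pyRDDLGym_jax.core.planner import (
    JaxBackpropPlanner, JaxStraightLinePlan, JaxOnlineController)

env = pyRDDLGym.make('Wildfire', '0', vectorized=True)
planner = JaxBackpropPlanner(
    rddl=env.model, plan=JaxStraightLinePlan(), rollout_horizon=5)
agent = JaxOnlineController(planner, warm_start=True, train_seconds=1)
agent.evaluate(env, episodes=1)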