keras-rs-nightly 0.2.2.dev202508190331__py3-none-any.whl → 0.3.1.dev202512130338__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_rs/losses/__init__.py +1 -0
- keras_rs/src/layers/embedding/base_distributed_embedding.py +12 -10
- keras_rs/src/layers/embedding/distributed_embedding_config.py +2 -2
- keras_rs/src/layers/embedding/jax/distributed_embedding.py +127 -197
- keras_rs/src/layers/embedding/jax/embedding_lookup.py +25 -4
- keras_rs/src/layers/embedding/jax/embedding_utils.py +22 -401
- keras_rs/src/layers/embedding/tensorflow/config_conversion.py +26 -19
- keras_rs/src/layers/embedding/tensorflow/distributed_embedding.py +15 -5
- keras_rs/src/losses/list_mle_loss.py +212 -0
- keras_rs/src/metrics/ranking_metrics_utils.py +21 -2
- keras_rs/src/utils/tpu_test_utils.py +120 -0
- keras_rs/src/version.py +1 -1
- {keras_rs_nightly-0.2.2.dev202508190331.dist-info → keras_rs_nightly-0.3.1.dev202512130338.dist-info}/METADATA +4 -3
- {keras_rs_nightly-0.2.2.dev202508190331.dist-info → keras_rs_nightly-0.3.1.dev202512130338.dist-info}/RECORD +16 -14
- {keras_rs_nightly-0.2.2.dev202508190331.dist-info → keras_rs_nightly-0.3.1.dev202512130338.dist-info}/WHEEL +0 -0
- {keras_rs_nightly-0.2.2.dev202508190331.dist-info → keras_rs_nightly-0.3.1.dev202512130338.dist-info}/top_level.txt +0 -0
keras_rs/losses/__init__.py
CHANGED
@@ -4,6 +4,7 @@ This file was autogenerated. Do not edit it by hand,
 since your modifications would be overwritten.
 """
 
+from keras_rs.src.losses.list_mle_loss import ListMLELoss as ListMLELoss
 from keras_rs.src.losses.pairwise_hinge_loss import (
     PairwiseHingeLoss as PairwiseHingeLoss,
 )
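The hunk above re-exports the new ListMLE ranking loss as `keras_rs.losses.ListMLELoss`. A minimal usage sketch, assuming it follows the standard Keras loss call convention `loss(y_true, y_pred)` over per-list relevance labels and scores; the shapes and values below are illustrative, not taken from the package:

import numpy as np
import keras_rs

# Assumption: ListMLELoss behaves like the other keras_rs ranking losses and
# accepts batches of shape (num_lists, list_size).
loss_fn = keras_rs.losses.ListMLELoss()
y_true = np.array([[1.0, 0.0, 2.0]])  # graded relevance per item
y_pred = np.array([[0.3, 0.1, 0.8]])  # predicted ranking scores
print(float(loss_fn(y_true, y_pred)))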
keras_rs/src/layers/embedding/base_distributed_embedding.py
CHANGED

@@ -822,13 +822,13 @@ class DistributedEmbedding(keras.layers.Layer):
         table_stacking: str | Sequence[Sequence[str]],
     ) -> None:
         del table_stacking
-
+        table_config_id_to_embedding_layer: dict[int, EmbedReduce] = {}
         self._default_device_embedding_layers: dict[str, EmbedReduce] = {}
 
         for path, feature_config in feature_configs.items():
-            if feature_config.table in
+            if id(feature_config.table) in table_config_id_to_embedding_layer:
                 self._default_device_embedding_layers[path] = (
-
+                    table_config_id_to_embedding_layer[id(feature_config.table)]
                 )
             else:
                 embedding_layer = EmbedReduce(
@@ -838,7 +838,9 @@ class DistributedEmbedding(keras.layers.Layer):
                     embeddings_initializer=feature_config.table.initializer,
                     combiner=feature_config.table.combiner,
                 )
-
+                table_config_id_to_embedding_layer[id(feature_config.table)] = (
+                    embedding_layer
+                )
                 self._default_device_embedding_layers[path] = embedding_layer
 
     def _default_device_build(
@@ -1013,8 +1015,8 @@ class DistributedEmbedding(keras.layers.Layer):
 
         # The serialized `TableConfig` objects.
         table_config_dicts: list[dict[str, Any]] = []
-        # Mapping from `TableConfig` to index in `table_config_dicts`.
-
+        # Mapping from `TableConfig` id to index in `table_config_dicts`.
+        table_config_id_to_index: dict[int, int] = {}
 
         def serialize_feature_config(
             feature_config: FeatureConfig,
@@ -1024,17 +1026,17 @@ class DistributedEmbedding(keras.layers.Layer):
             # key.
             feature_config_dict = feature_config.get_config()
 
-            if feature_config.table not in
+            if id(feature_config.table) not in table_config_id_to_index:
                 # Save the serialized `TableConfig` the first time we see it and
                 # remember its index.
-
+                table_config_id_to_index[id(feature_config.table)] = len(
                     table_config_dicts
                 )
                 table_config_dicts.append(feature_config_dict["table"])
 
             # Replace the serialized `TableConfig` with its index.
-            feature_config_dict["table"] =
-                feature_config.table
+            feature_config_dict["table"] = table_config_id_to_index[
+                id(feature_config.table)
             ]
             return feature_config_dict
 
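These hunks switch the deduplication dictionaries from `TableConfig`-keyed to `id(TableConfig)`-keyed, so shared tables are deduplicated by object identity rather than by value comparison. A standalone sketch of the same pattern, using a hypothetical dataclass rather than the real keras_rs configs:

import dataclasses

# Assumption: illustrative stand-in for a config that compares by value.
# With order=True (which implies eq=True), the default hash is removed, so
# instances can no longer be used as dict keys directly; id() still works.
@dataclasses.dataclass(order=True)
class Table:
    name: str
    dim: int

shared = Table("users", 32)
features = {"clicks": shared, "views": shared, "items": Table("items", 16)}

layer_by_table_id: dict[int, str] = {}
for path, table in features.items():
    if id(table) not in layer_by_table_id:
        layer_by_table_id[id(table)] = f"embedding_{table.name}"

# The shared "users" table produced one layer despite backing two features.
assert len(layer_by_table_id) == 2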
keras_rs/src/layers/embedding/distributed_embedding_config.py
CHANGED

@@ -10,7 +10,7 @@ from keras_rs.src.api_export import keras_rs_export
 
 
 @keras_rs_export("keras_rs.layers.TableConfig")
-@dataclasses.dataclass(
+@dataclasses.dataclass(order=True)
 class TableConfig:
     """Configuration for one embedding table.
 
@@ -88,7 +88,7 @@ class TableConfig:
 
 
 @keras_rs_export("keras_rs.layers.FeatureConfig")
-@dataclasses.dataclass(
+@dataclasses.dataclass(order=True)
 class FeatureConfig:
     """Configuration for one embedding feature.
 
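With `order=True`, the generated comparison methods presumably let collections of `TableConfig` and `FeatureConfig` objects be compared and sorted deterministically by their field values. A tiny sketch of what the flag enables on a dataclass in general (hypothetical class, not the real configs):

import dataclasses

@dataclasses.dataclass(order=True)
class Cfg:
    name: str
    dim: int

# Comparisons use the fields in declaration order: name first, then dim.
configs = [Cfg("items", 16), Cfg("users", 32), Cfg("items", 8)]
print(sorted(configs))  # items/8, items/16, users/32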
keras_rs/src/layers/embedding/jax/distributed_embedding.py
CHANGED

@@ -9,13 +9,13 @@ import keras
 import numpy as np
 from jax import numpy as jnp
 from jax.experimental import layout as jax_layout
+from jax.experimental import multihost_utils
 from jax_tpu_embedding.sparsecore.lib.nn import embedding
 from jax_tpu_embedding.sparsecore.lib.nn import embedding_spec
 from jax_tpu_embedding.sparsecore.lib.nn import (
     table_stacking as jte_table_stacking,
 )
 from jax_tpu_embedding.sparsecore.utils import utils as jte_utils
-from keras.src import backend
 
 from keras_rs.src import types
 from keras_rs.src.layers.embedding import base_distributed_embedding
@@ -28,9 +28,14 @@ from keras_rs.src.layers.embedding.jax import embedding_utils
 from keras_rs.src.types import Nested
 from keras_rs.src.utils import keras_utils
 
+if jax.__version_info__ >= (0, 8, 0):
+    from jax import shard_map
+else:
+    from jax.experimental.shard_map import shard_map  # type: ignore[assignment]
+
+
 ArrayLike = Union[np.ndarray[Any, Any], jax.Array]
 FeatureConfig = config.FeatureConfig
-shard_map = jax.experimental.shard_map.shard_map  # type: ignore[attr-defined]
 
 
 def _get_partition_spec(
@@ -247,23 +252,6 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         )
         return sparsecore_distribution, sparsecore_layout
 
-    def _create_cpu_distribution(
-        self, cpu_axis_name: str = "cpu"
-    ) -> tuple[
-        keras.distribution.ModelParallel, keras.distribution.TensorLayout
-    ]:
-        """Share a variable across all CPU processes."""
-        cpu_devices = jax.devices("cpu")
-        device_mesh = keras.distribution.DeviceMesh(
-            (len(cpu_devices),), [cpu_axis_name], cpu_devices
-        )
-        replicated_layout = keras.distribution.TensorLayout([], device_mesh)
-        layout_map = keras.distribution.LayoutMap(device_mesh=device_mesh)
-        cpu_distribution = keras.distribution.ModelParallel(
-            layout_map=layout_map
-        )
-        return cpu_distribution, replicated_layout
-
     def _add_sparsecore_weight(
         self,
         name: str,
@@ -283,7 +271,7 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         table_specs: Sequence[embedding_spec.TableSpec],
         num_shards: int,
         add_slot_variables: bool,
-    ) ->
+    ) -> embedding.EmbeddingVariables:
         stacked_table_spec = typing.cast(
             embedding_spec.StackedTableSpec, table_specs[0].stacked_table_spec
         )
@@ -352,7 +340,7 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
             slot_initializers, slot_variables
         )
 
-        return table_variable, slot_variables
+        return embedding.EmbeddingVariables(table_variable, slot_variables)
 
     @keras_utils.no_automatic_dependency_tracking
     def _sparsecore_init(
@@ -405,11 +393,6 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         self._sparsecore_layout = sparsecore_layout
         self._sparsecore_distribution = sparsecore_distribution
 
-        # Distribution for CPU operations.
-        cpu_distribution, cpu_layout = self._create_cpu_distribution()
-        self._cpu_distribution = cpu_distribution
-        self._cpu_layout = cpu_layout
-
         mesh = sparsecore_distribution.device_mesh.backend_mesh
         global_device_count = mesh.devices.size
         num_sc_per_device = jte_utils.num_sparsecores_per_device(
@@ -464,12 +447,51 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         )
 
         # Collect all stacked tables.
-        table_specs =
-        table_stacks =
-
-
-
-        }
+        table_specs = embedding.get_table_specs(feature_specs)
+        table_stacks = jte_table_stacking.get_table_stacks(table_specs)
+
+        # Update stacked table stats to max of values across involved tables.
+        max_ids_per_partition = {}
+        max_unique_ids_per_partition = {}
+        required_buffer_size_per_device = {}
+        id_drop_counters = {}
+        for stack_name, stack in table_stacks.items():
+            max_ids_per_partition[stack_name] = np.max(
+                np.asarray(
+                    [s.max_ids_per_partition for s in stack], dtype=np.int32
+                )
+            )
+            max_unique_ids_per_partition[stack_name] = np.max(
+                np.asarray(
+                    [s.max_unique_ids_per_partition for s in stack],
+                    dtype=np.int32,
+                )
+            )
+
+            # Only set the suggested buffer size if set on any individual table.
+            valid_buffer_sizes = [
+                s.suggested_coo_buffer_size_per_device
+                for s in stack
+                if s.suggested_coo_buffer_size_per_device is not None
+            ]
+            if valid_buffer_sizes:
+                required_buffer_size_per_device[stack_name] = np.max(
+                    np.asarray(valid_buffer_sizes, dtype=np.int32)
+                )
+
+            id_drop_counters[stack_name] = 0
+
+        aggregated_stats = embedding.SparseDenseMatmulInputStats(
+            max_ids_per_partition=max_ids_per_partition,
+            max_unique_ids_per_partition=max_unique_ids_per_partition,
+            required_buffer_size_per_sc=required_buffer_size_per_device,
+            id_drop_counters=id_drop_counters,
+        )
+        embedding.update_preprocessing_parameters(
+            feature_specs,
+            aggregated_stats,
+            num_sc_per_device,
+        )
 
         # Create variables for all stacked tables and slot variables.
         with sparsecore_distribution.scope():
@@ -502,50 +524,6 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         )
         self._iterations.overwrite_with_gradient = True
 
-        with cpu_distribution.scope():
-            # Create variables to track static buffer size and max IDs for each
-            # table during preprocessing. These variables are shared across all
-            # processes on CPU. We don't add these via `add_weight` because we
-            # can't have them passed to the training function.
-            replicated_zeros_initializer = ShardedInitializer(
-                "zeros", cpu_layout
-            )
-
-            with backend.name_scope(self.name, caller=self):
-                self._preprocessing_buffer_size = {
-                    table_name: backend.Variable(
-                        initializer=replicated_zeros_initializer,
-                        shape=(),
-                        dtype=backend.standardize_dtype("int32"),
-                        trainable=False,
-                        name=table_name + ":preprocessing:buffer_size",
-                    )
-                    for table_name in stacked_table_specs.keys()
-                }
-                self._preprocessing_max_unique_ids_per_partition = {
-                    table_name: backend.Variable(
-                        shape=(),
-                        name=table_name
-                        + ":preprocessing:max_unique_ids_per_partition",
-                        initializer=replicated_zeros_initializer,
-                        dtype=backend.standardize_dtype("int32"),
-                        trainable=False,
-                    )
-                    for table_name in stacked_table_specs.keys()
-                }
-
-                self._preprocessing_max_ids_per_partition = {
-                    table_name: backend.Variable(
-                        shape=(),
-                        name=table_name
-                        + ":preprocessing:max_ids_per_partition",
-                        initializer=replicated_zeros_initializer,
-                        dtype=backend.standardize_dtype("int32"),
-                        trainable=False,
-                    )
-                    for table_name in stacked_table_specs.keys()
-                }
-
         self._config = jte_embedding_lookup.EmbeddingLookupConfiguration(
             feature_specs,
             mesh=mesh,
@@ -586,10 +564,8 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         del inputs, weights, training
 
         # Each stacked-table gets a ShardedCooMatrix.
-        table_specs =
-
-        )
-        table_stacks = embedding_utils.get_table_stacks(table_specs)
+        table_specs = embedding.get_table_specs(self._config.feature_specs)
+        table_stacks = jte_table_stacking.get_table_stacks(table_specs)
         stacked_table_specs = {
             stack_name: stack[0].stacked_table_spec
             for stack_name, stack in table_stacks.items()
@@ -660,125 +636,74 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
             mesh.devices.item(0)
         )
 
-        # Get current buffer size/max_ids.
-        previous_max_ids_per_partition = keras.tree.map_structure(
-            lambda max_ids_per_partition: max_ids_per_partition.value.item(),
-            self._preprocessing_max_ids_per_partition,
-        )
-        previous_max_unique_ids_per_partition = keras.tree.map_structure(
-            lambda max_unique_ids_per_partition: (
-                max_unique_ids_per_partition.value.item()
-            ),
-            self._preprocessing_max_unique_ids_per_partition,
-        )
-        previous_buffer_size = keras.tree.map_structure(
-            lambda buffer_size: buffer_size.value.item(),
-            self._preprocessing_buffer_size,
-        )
-
         preprocessed, stats = embedding_utils.stack_and_shard_samples(
             self._config.feature_specs,
             samples,
             local_device_count,
             global_device_count,
             num_sc_per_device,
-            static_buffer_size=previous_buffer_size,
         )
 
-        # Extract max unique IDs and buffer sizes.
-        # We need to replicate this value across all local CPU devices.
         if training:
-
-
-                table_name: np.repeat(
-                    # Maximum across all partitions and previous max.
-                    np.maximum(
-                        np.max(elems),
-                        previous_max_ids_per_partition[table_name],
-                    ),
-                    num_local_cpu_devices,
-                )
-                for table_name, elems in stats.max_ids_per_partition.items()
-            }
-            local_max_unique_ids_per_partition = {
-                name: np.repeat(
-                    # Maximum across all partitions and previous max.
-                    np.maximum(
-                        np.max(elems),
-                        previous_max_unique_ids_per_partition[name],
-                    ),
-                    num_local_cpu_devices,
-                )
-                for name, elems in stats.max_unique_ids_per_partition.items()
-            }
-            local_buffer_size = {
-                table_name: np.repeat(
-                    np.maximum(
-                        np.max(
-                            # Round values up to the next multiple of 8.
-                            # Currently using this as a proxy for the actual
-                            # required buffer size.
-                            ((elems + 7) // 8) * 8
-                        )
-                        * global_device_count
-                        * num_sc_per_device
-                        * local_device_count
-                        * num_sc_per_device,
-                        previous_buffer_size[table_name],
-                    ),
-                    num_local_cpu_devices,
-                )
-                for table_name, elems in stats.max_ids_per_partition.items()
-            }
+            # Synchronize input statistics across all devices and update the
+            # underlying stacked tables specs in the feature specs.
 
-            #
-
-
-
-
-
-
-            )
-            new_max_ids_per_partition = max_across_cpus(
-                local_max_ids_per_partition
+            # Gather stats across all processes/devices via process_allgather.
+            all_stats = multihost_utils.process_allgather(stats)
+            all_stats = jax.tree.map(np.max, all_stats)
+
+            # Check if stats changed enough to warrant action.
+            stacked_table_specs = embedding.get_stacked_table_specs(
+                self._config.feature_specs
             )
-
-
+            changed = any(
+                all_stats.max_ids_per_partition[stack_name]
+                > spec.max_ids_per_partition
+                or all_stats.max_unique_ids_per_partition[stack_name]
+                > spec.max_unique_ids_per_partition
+                or all_stats.required_buffer_size_per_sc[stack_name]
+                * num_sc_per_device
+                > (spec.suggested_coo_buffer_size_per_device or 0)
+                for stack_name, spec in stacked_table_specs.items()
            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Update configuration and repeat preprocessing if stats changed.
+            if changed:
+                for stack_name, spec in stacked_table_specs.items():
+                    all_stats.max_ids_per_partition[stack_name] = np.max(
+                        [
+                            all_stats.max_ids_per_partition[stack_name],
+                            spec.max_ids_per_partition,
+                        ]
+                    )
+                    all_stats.max_unique_ids_per_partition[stack_name] = np.max(
+                        [
+                            all_stats.max_unique_ids_per_partition[stack_name],
+                            spec.max_unique_ids_per_partition,
+                        ]
+                    )
+                    all_stats.required_buffer_size_per_sc[stack_name] = np.max(
+                        [
+                            all_stats.required_buffer_size_per_sc[stack_name],
+                            (
+                                (spec.suggested_coo_buffer_size_per_device or 0)
+                                + (num_sc_per_device - 1)
+                            )
+                            // num_sc_per_device,
+                        ]
+                    )
+
+                embedding.update_preprocessing_parameters(
+                    self._config.feature_specs, all_stats, num_sc_per_device
                )
-
+
+                # Re-execute preprocessing with consistent input statistics.
+                preprocessed, _ = embedding_utils.stack_and_shard_samples(
                     self._config.feature_specs,
-
-
+                    samples,
+                    local_device_count,
+                    global_device_count,
+                    num_sc_per_device,
                 )
 
         return {"inputs": preprocessed}
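The rewritten preprocessing path above drops the CPU-replicated tracking variables and instead synchronizes the gathered input statistics once per training step. A minimal standalone sketch of that gather-and-reduce pattern (the stats dict below is hypothetical, not the keras_rs structure):

import jax
import numpy as np
from jax.experimental import multihost_utils

# Per-process statistics, e.g. max IDs observed per stacked table locally.
local_stats = {
    "users_table": np.array([12, 7], dtype=np.int32),
    "items_table": np.array([3, 9], dtype=np.int32),
}

# Gather every process's copy of the pytree, then reduce each leaf to a single
# global maximum, mirroring the process_allgather + jax.tree.map(np.max, ...)
# step in the hunk above.
gathered = multihost_utils.process_allgather(local_stats)
global_max = jax.tree.map(np.max, gathered)
print(global_max)  # global elementwise max per table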
@@ -826,19 +751,22 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
             raise ValueError("Layer must first be built before setting tables.")
 
         if "default_device" in self._placement_to_path_to_feature_config:
-
+            table_name_to_embedding_layer = {}
             for (
                 path,
                 feature_config,
             ) in self._placement_to_path_to_feature_config[
                 "default_device"
             ].items():
-
+                table_name_to_embedding_layer[feature_config.table.name] = (
                     self._default_device_embedding_layers[path]
                 )
 
-            for
-
+            for (
+                table_name,
+                embedding_layer,
+            ) in table_name_to_embedding_layer.items():
+                table_values = tables.get(table_name, None)
                 if table_values is not None:
                     if embedding_layer.lora_enabled:
                         raise ValueError("Cannot set table if LoRA is enabled.")
@@ -851,8 +779,8 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
 
         config = self._config
         num_table_shards = config.mesh.devices.size * config.num_sc_per_device
-        table_specs =
-        sharded_tables =
+        table_specs = embedding.get_table_specs(config.feature_specs)
+        sharded_tables = jte_table_stacking.stack_and_shard_tables(
            table_specs,
            tables,
            num_table_shards,
@@ -871,8 +799,8 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
         # Assign stacked table variables to the device values.
         keras.tree.map_structure_up_to(
             device_tables,
-            lambda
-            table_value:
+            lambda embedding_variables,
+            table_value: embedding_variables.table.assign(table_value),
             self._table_and_slot_variables,
             device_tables,
         )
@@ -883,17 +811,19 @@ class DistributedEmbedding(base_distributed_embedding.DistributedEmbedding):
 
         config = self._config
         num_table_shards = config.mesh.devices.size * config.num_sc_per_device
-        table_specs =
+        table_specs = embedding.get_table_specs(config.feature_specs)
 
         # Extract only the table variables, not the gradient slot variables.
         table_variables = {
-            name: jax.device_get(
-            for name,
+            name: jax.device_get(embedding_variables.table.value)
+            for name, embedding_variables in (
+                self._table_and_slot_variables.items()
+            )
         }
 
         return typing.cast(
             dict[str, ArrayLike],
-
+            jte_table_stacking.unshard_and_unstack_tables(
                 table_specs, table_variables, num_table_shards
             ),
         )
keras_rs/src/layers/embedding/jax/embedding_lookup.py
CHANGED

@@ -16,9 +16,30 @@ from jax_tpu_embedding.sparsecore.utils import utils as jte_utils
 from keras_rs.src.layers.embedding.jax import embedding_utils
 from keras_rs.src.types import Nested
 
-
-
+if jax.__version_info__ >= (0, 8, 0):
+    from jax import shard_map
+else:
+    from jax.experimental.shard_map import shard_map as exp_shard_map
+
+    def shard_map(  # type: ignore[misc]
+        f: Any = None,
+        /,
+        *,
+        out_specs: Any,
+        in_specs: Any,
+        mesh: Any = None,
+        check_vma: bool = True,
+    ) -> Any:
+        return exp_shard_map(
+            f,
+            mesh=mesh,
+            in_specs=in_specs,
+            out_specs=out_specs,
+            check_rep=check_vma,
+        )  # type: ignore[no-untyped-call]
+
 
+ShardedCooMatrix = embedding_utils.ShardedCooMatrix
 ArrayLike: TypeAlias = jax.Array | np.ndarray[Any, Any]
 JaxLayout: TypeAlias = jax.sharding.NamedSharding | jax_layout.Format
 
@@ -121,7 +142,7 @@ def embedding_lookup(
             mesh=config.mesh,
             in_specs=(pd, pt),
             out_specs=pd,
-
+            check_vma=False,
         ),
     )
 
@@ -220,7 +241,7 @@ def embedding_lookup_bwd(
             mesh=config.mesh,
             in_specs=(pd, pd, pt, preplicate),
             out_specs=pt,
-
+            check_vma=False,
         ),
         # in_shardings=(
         #     activation_layout,