PyPI - tf-nightly-cpu - Versions diffs - 2.20.0.dev20250220__cp310-cp310-win_amd64.whl → 2.20.0.dev20250222__cp310-cp310-win_amd64.whl - Mend

tf-nightly-cpu 2.20.0.dev20250220__cp310-cp310-win_amd64.whl → 2.20.0.dev20250222__cp310-cp310-win_amd64.whl

Files changed (128) hide show

tensorflow/_api/v2/compat/v1/summary/__init__.py CHANGED Viewed

@@ -5,8 +5,8 @@
 import sys as _sys
-from tensorflow.python.ops.summary_ops_v2 import all_v2_summary_ops # line: 661
-from tensorflow.python.ops.summary_ops_v2 import initialize # line: 473
+from tensorflow.python.ops.summary_ops_v2 import all_v2_summary_ops # line: 665
+from tensorflow.python.ops.summary_ops_v2 import initialize # line: 477
 from tensorflow.python.proto_exports import Event # line: 28
 from tensorflow.python.proto_exports import SessionLog # line: 47
 from tensorflow.python.proto_exports import Summary # line: 50

tensorflow/_api/v2/compat/v1/tpu/experimental/embedding/__init__.py CHANGED Viewed

@@ -19,8 +19,8 @@ from tensorflow.python.tpu.tpu_embedding_v2_utils import QuantizationConfig # li
 from tensorflow.python.tpu.tpu_embedding_v2_utils import RowIdInitializer # line: 1347
 from tensorflow.python.tpu.tpu_embedding_v2_utils import SGD # line: 363
 from tensorflow.python.tpu.tpu_embedding_v2_utils import TableConfig # line: 1161
-from tensorflow.python.tpu.tpu_embedding_v3 import SparseCoreEmbeddingConfig # line: 77
-from tensorflow.python.tpu.tpu_embedding_v3 import TPUEmbeddingV2 # line: 475
+from tensorflow.python.tpu.tpu_embedding_v3 import SparseCoreEmbeddingConfig # line: 78
+from tensorflow.python.tpu.tpu_embedding_v3 import TPUEmbeddingV2 # line: 482
 from tensorflow.python.util import module_wrapper as _module_wrapper

tensorflow/_api/v2/compat/v2/summary/__init__.py CHANGED Viewed

@@ -6,17 +6,17 @@
 import sys as _sys
 from tensorflow._api.v2.compat.v2.summary import experimental
-from tensorflow.python.ops.summary_ops_v2 import SummaryWriter # line: 244
-from tensorflow.python.ops.summary_ops_v2 import create_file_writer_v2 as create_file_writer # line: 516
-from tensorflow.python.ops.summary_ops_v2 import create_noop_writer # line: 641
-from tensorflow.python.ops.summary_ops_v2 import flush # line: 1141
-from tensorflow.python.ops.summary_ops_v2 import graph # line: 1053
-from tensorflow.python.ops.summary_ops_v2 import record_if # line: 157
+from tensorflow.python.ops.summary_ops_v2 import SummaryWriter # line: 248
+from tensorflow.python.ops.summary_ops_v2 import create_file_writer_v2 as create_file_writer # line: 520
+from tensorflow.python.ops.summary_ops_v2 import create_noop_writer # line: 645
+from tensorflow.python.ops.summary_ops_v2 import flush # line: 1145
+from tensorflow.python.ops.summary_ops_v2 import graph # line: 1057
+from tensorflow.python.ops.summary_ops_v2 import record_if # line: 161
 from tensorflow.python.ops.summary_ops_v2 import should_record_summaries # line: 133
-from tensorflow.python.ops.summary_ops_v2 import trace_export # line: 1390
-from tensorflow.python.ops.summary_ops_v2 import trace_off # line: 1443
-from tensorflow.python.ops.summary_ops_v2 import trace_on # line: 1334
-from tensorflow.python.ops.summary_ops_v2 import write # line: 737
+from tensorflow.python.ops.summary_ops_v2 import trace_export # line: 1394
+from tensorflow.python.ops.summary_ops_v2 import trace_off # line: 1447
+from tensorflow.python.ops.summary_ops_v2 import trace_on # line: 1338
+from tensorflow.python.ops.summary_ops_v2 import write # line: 741
 from tensorflow.python.summary.tb_summary import audio # line: 32
 from tensorflow.python.summary.tb_summary import histogram # line: 89
 from tensorflow.python.summary.tb_summary import image # line: 165

tensorflow/_api/v2/compat/v2/summary/experimental/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@
 import sys as _sys
-from tensorflow.python.ops.summary_ops_v2 import get_step # line: 214
-from tensorflow.python.ops.summary_ops_v2 import set_step # line: 225
-from tensorflow.python.ops.summary_ops_v2 import summary_scope # line: 696
-from tensorflow.python.ops.summary_ops_v2 import write_raw_pb # line: 814
+from tensorflow.python.ops.summary_ops_v2 import get_step # line: 218
+from tensorflow.python.ops.summary_ops_v2 import set_step # line: 229
+from tensorflow.python.ops.summary_ops_v2 import summary_scope # line: 700
+from tensorflow.python.ops.summary_ops_v2 import write_raw_pb # line: 818

tensorflow/_api/v2/compat/v2/tpu/experimental/embedding/__init__.py CHANGED Viewed

@@ -19,5 +19,5 @@ from tensorflow.python.tpu.tpu_embedding_v2_utils import QuantizationConfig # li
 from tensorflow.python.tpu.tpu_embedding_v2_utils import RowIdInitializer # line: 1347
 from tensorflow.python.tpu.tpu_embedding_v2_utils import SGD # line: 363
 from tensorflow.python.tpu.tpu_embedding_v2_utils import TableConfig # line: 1161
-from tensorflow.python.tpu.tpu_embedding_v3 import SparseCoreEmbeddingConfig # line: 77
-from tensorflow.python.tpu.tpu_embedding_v3 import TPUEmbeddingV2 # line: 475
+from tensorflow.python.tpu.tpu_embedding_v3 import SparseCoreEmbeddingConfig # line: 78
+from tensorflow.python.tpu.tpu_embedding_v3 import TPUEmbeddingV2 # line: 482

tensorflow/_api/v2/summary/__init__.py CHANGED Viewed

@@ -6,17 +6,17 @@
 import sys as _sys
 from tensorflow._api.v2.summary import experimental
-from tensorflow.python.ops.summary_ops_v2 import SummaryWriter # line: 244
-from tensorflow.python.ops.summary_ops_v2 import create_file_writer_v2 as create_file_writer # line: 516
-from tensorflow.python.ops.summary_ops_v2 import create_noop_writer # line: 641
-from tensorflow.python.ops.summary_ops_v2 import flush # line: 1141
-from tensorflow.python.ops.summary_ops_v2 import graph # line: 1053
-from tensorflow.python.ops.summary_ops_v2 import record_if # line: 157
+from tensorflow.python.ops.summary_ops_v2 import SummaryWriter # line: 248
+from tensorflow.python.ops.summary_ops_v2 import create_file_writer_v2 as create_file_writer # line: 520
+from tensorflow.python.ops.summary_ops_v2 import create_noop_writer # line: 645
+from tensorflow.python.ops.summary_ops_v2 import flush # line: 1145
+from tensorflow.python.ops.summary_ops_v2 import graph # line: 1057
+from tensorflow.python.ops.summary_ops_v2 import record_if # line: 161
 from tensorflow.python.ops.summary_ops_v2 import should_record_summaries # line: 133
-from tensorflow.python.ops.summary_ops_v2 import trace_export # line: 1390
-from tensorflow.python.ops.summary_ops_v2 import trace_off # line: 1443
-from tensorflow.python.ops.summary_ops_v2 import trace_on # line: 1334
-from tensorflow.python.ops.summary_ops_v2 import write # line: 737
+from tensorflow.python.ops.summary_ops_v2 import trace_export # line: 1394
+from tensorflow.python.ops.summary_ops_v2 import trace_off # line: 1447
+from tensorflow.python.ops.summary_ops_v2 import trace_on # line: 1338
+from tensorflow.python.ops.summary_ops_v2 import write # line: 741
 from tensorflow.python.summary.tb_summary import audio # line: 32
 from tensorflow.python.summary.tb_summary import histogram # line: 89
 from tensorflow.python.summary.tb_summary import image # line: 165

tensorflow/_api/v2/summary/experimental/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@
 import sys as _sys
-from tensorflow.python.ops.summary_ops_v2 import get_step # line: 214
-from tensorflow.python.ops.summary_ops_v2 import set_step # line: 225
-from tensorflow.python.ops.summary_ops_v2 import summary_scope # line: 696
-from tensorflow.python.ops.summary_ops_v2 import write_raw_pb # line: 814
+from tensorflow.python.ops.summary_ops_v2 import get_step # line: 218
+from tensorflow.python.ops.summary_ops_v2 import set_step # line: 229
+from tensorflow.python.ops.summary_ops_v2 import summary_scope # line: 700
+from tensorflow.python.ops.summary_ops_v2 import write_raw_pb # line: 818

tensorflow/_api/v2/tpu/experimental/embedding/__init__.py CHANGED Viewed

@@ -19,5 +19,5 @@ from tensorflow.python.tpu.tpu_embedding_v2_utils import QuantizationConfig # li
 from tensorflow.python.tpu.tpu_embedding_v2_utils import RowIdInitializer # line: 1347
 from tensorflow.python.tpu.tpu_embedding_v2_utils import SGD # line: 363
 from tensorflow.python.tpu.tpu_embedding_v2_utils import TableConfig # line: 1161
-from tensorflow.python.tpu.tpu_embedding_v3 import SparseCoreEmbeddingConfig # line: 77
-from tensorflow.python.tpu.tpu_embedding_v3 import TPUEmbeddingV2 # line: 475
+from tensorflow.python.tpu.tpu_embedding_v3 import SparseCoreEmbeddingConfig # line: 78
+from tensorflow.python.tpu.tpu_embedding_v3 import TPUEmbeddingV2 # line: 482

tensorflow/compiler/mlir/stablehlo/stablehlo_extension.pyd CHANGED Viewed

Binary file

tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyd CHANGED Viewed

Binary file

tensorflow/compiler/tf2xla/ops/_xla_ops.so CHANGED Viewed

Binary file

tensorflow/include/external/llvm-project/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h CHANGED Viewed

@@ -71,6 +71,18 @@ public:
                                unsigned firstIndex) override;
 };
+/// Succeeds if an op can be converted to its unsigned equivalent without
+/// changing its semantics. This is the case when none of its operands or
+/// results can be below 0 when analyzed from a signed perspective.
+LogicalResult staticallyNonNegative(DataFlowSolver &solver, Operation *op);
+/// Succeeds when a value is statically non-negative in that it has a lower
+/// bound on its value (if it is treated as signed) and that bound is
+/// non-negative.
+/// Note, the results of this query may not be accurate for `index` if you plan
+/// to use a non-64-bit index.
+LogicalResult staticallyNonNegative(DataFlowSolver &solver, Value v);
 } // end namespace dataflow
 } // end namespace mlir

tensorflow/include/external/llvm-project/mlir/include/mlir/Dialect/Math/IR/MathOps.h.inc CHANGED Viewed

@@ -5711,6 +5711,7 @@ public:
   static void populateDefaultProperties(::mlir::OperationName opName, Properties &properties);
   ::llvm::LogicalResult verifyInvariantsImpl();
   ::llvm::LogicalResult verifyInvariants();
+  ::mlir::OpFoldResult fold(FoldAdaptor adaptor);
   static ::llvm::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, ::std::optional<::mlir::Location> location, ::mlir::ValueRange operands, ::mlir::DictionaryAttr attributes, ::mlir::OpaqueProperties properties, ::mlir::RegionRange regions, ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes);
   static ::mlir::ParseResult parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result);
   void print(::mlir::OpAsmPrinter &_odsPrinter);
@@ -5925,6 +5926,7 @@ public:
   static void populateDefaultProperties(::mlir::OperationName opName, Properties &properties);
   ::llvm::LogicalResult verifyInvariantsImpl();
   ::llvm::LogicalResult verifyInvariants();
+  ::mlir::OpFoldResult fold(FoldAdaptor adaptor);
   static ::llvm::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, ::std::optional<::mlir::Location> location, ::mlir::ValueRange operands, ::mlir::DictionaryAttr attributes, ::mlir::OpaqueProperties properties, ::mlir::RegionRange regions, ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes);
   static ::mlir::ParseResult parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result);
   void print(::mlir::OpAsmPrinter &_odsPrinter);
@@ -6139,6 +6141,7 @@ public:
   static void populateDefaultProperties(::mlir::OperationName opName, Properties &properties);
   ::llvm::LogicalResult verifyInvariantsImpl();
   ::llvm::LogicalResult verifyInvariants();
+  ::mlir::OpFoldResult fold(FoldAdaptor adaptor);
   static ::llvm::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, ::std::optional<::mlir::Location> location, ::mlir::ValueRange operands, ::mlir::DictionaryAttr attributes, ::mlir::OpaqueProperties properties, ::mlir::RegionRange regions, ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes);
   static ::mlir::ParseResult parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result);
   void print(::mlir::OpAsmPrinter &_odsPrinter);
@@ -6353,6 +6356,7 @@ public:
   static void populateDefaultProperties(::mlir::OperationName opName, Properties &properties);
   ::llvm::LogicalResult verifyInvariantsImpl();
   ::llvm::LogicalResult verifyInvariants();
+  ::mlir::OpFoldResult fold(FoldAdaptor adaptor);
   static ::llvm::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, ::std::optional<::mlir::Location> location, ::mlir::ValueRange operands, ::mlir::DictionaryAttr attributes, ::mlir::OpaqueProperties properties, ::mlir::RegionRange regions, ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes);
   static ::mlir::ParseResult parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result);
   void print(::mlir::OpAsmPrinter &_odsPrinter);

tensorflow/include/external/shardy/shardy/dialect/sdy/transforms/propagation/aggressive_factor_propagation.h CHANGED Viewed

@@ -81,6 +81,15 @@ class AggressiveFactorPropagation : public BasicFactorPropagation {
       PropagationDirectionAlongFactor directionAlongFactor,
       ArrayRef<int64_t> factorSizes, MeshAttr mesh, Operation* op,
       bool conservativePropagation) const override;
+ private:
+  // Returns the axes to propagate to an individual factor in the given
+  // `tensorFactorShardings` of a tensor.
+  SmallVector<AxisRefAttr> getPropagatedFactorSharding(
+      int64_t factorIndex, const TensorFactorShardings& tensorFactorShardings,
+      const FactorIndexToSharding& factorIndexToSharding,
+      AxesPerFactorRef axesPerFactor, MeshAttr mesh,
+      bool conservativePropagation, ArrayRef<int64_t> factorSizes) const;
 };
 }  // namespace sdy

tensorflow/include/external/stablehlo/_virtual_includes/stablehlo_pass_utils/stablehlo/transforms/PassUtils.h CHANGED Viewed

@@ -69,6 +69,13 @@ Value getConstantLike(OpBuilder &b, Location loc, const APFloat &constant,
 // Check if any of the given types are mlir::quant::QuantizedType.
 bool isAnyQuantizedTypes(TypeRange types);
+// Creates a quantized element type based on the given parameters.
+Type getQuantizedElementType(Location loc, Type storageType, Type expressedType,
+                             ArrayRef<double> scales,
+                             ArrayRef<int64_t> zeroPoints,
+                             int32_t quantizedDimension, int64_t storageTypeMin,
+                             int64_t storageTypeMax);
 }  // namespace stablehlo
 }  // namespace mlir

tensorflow/include/external/stablehlo/_virtual_includes/stablehlo_passes/stablehlo/transforms/PassUtils.h CHANGED Viewed

@@ -69,6 +69,13 @@ Value getConstantLike(OpBuilder &b, Location loc, const APFloat &constant,
 // Check if any of the given types are mlir::quant::QuantizedType.
 bool isAnyQuantizedTypes(TypeRange types);
+// Creates a quantized element type based on the given parameters.
+Type getQuantizedElementType(Location loc, Type storageType, Type expressedType,
+                             ArrayRef<double> scales,
+                             ArrayRef<int64_t> zeroPoints,
+                             int32_t quantizedDimension, int64_t storageTypeMin,
+                             int64_t storageTypeMax);
 }  // namespace stablehlo
 }  // namespace mlir

tensorflow/include/external/stablehlo/_virtual_includes/version/stablehlo/dialect/Version.h CHANGED Viewed

@@ -38,7 +38,7 @@ class Version {
   static FailureOr<Version> fromString(llvm::StringRef versionRef);
   /// Return a Version representing the current VHLO dialect version.
-  static Version getCurrentVersion() { return Version(1, 9, 2); }
+  static Version getCurrentVersion() { return Version(1, 9, 3); }
   /// Return a Version representing the minimum supported VHLO dialect version.
   static Version getMinimumVersion() { return Version(0, 9, 0); }

tensorflow/include/external/stablehlo/stablehlo/dialect/Version.h CHANGED Viewed

@@ -38,7 +38,7 @@ class Version {
   static FailureOr<Version> fromString(llvm::StringRef versionRef);
   /// Return a Version representing the current VHLO dialect version.
-  static Version getCurrentVersion() { return Version(1, 9, 2); }
+  static Version getCurrentVersion() { return Version(1, 9, 3); }
   /// Return a Version representing the minimum supported VHLO dialect version.
   static Version getMinimumVersion() { return Version(0, 9, 0); }

tensorflow/include/external/stablehlo/stablehlo/transforms/PassUtils.h CHANGED Viewed

@@ -69,6 +69,13 @@ Value getConstantLike(OpBuilder &b, Location loc, const APFloat &constant,
 // Check if any of the given types are mlir::quant::QuantizedType.
 bool isAnyQuantizedTypes(TypeRange types);
+// Creates a quantized element type based on the given parameters.
+Type getQuantizedElementType(Location loc, Type storageType, Type expressedType,
+                             ArrayRef<double> scales,
+                             ArrayRef<int64_t> zeroPoints,
+                             int32_t quantizedDimension, int64_t storageTypeMin,
+                             int64_t storageTypeMax);
 }  // namespace stablehlo
 }  // namespace mlir

tensorflow/include/tensorflow/compiler/xla/backends/cpu/codegen/kernel_api_ir_builder.h CHANGED Viewed

@@ -89,9 +89,10 @@ class KernelApiIrBuilder {
     // read-only if it is not aliased with any result.
     absl::flat_hash_set<int64_t> invariant_arguments;
-    // the set of buffer uses for this kernel, can be empty if buffer
+    // The set of buffers used by this kernel, can be empty if buffer assignment
     // was not provided.
-    absl::InlinedVector<BufferUse, 8> buffer_uses;
+    absl::InlinedVector<BufferAllocation::Slice, 8> argument_buffers;
+    absl::InlinedVector<BufferAllocation::Slice, 8> result_buffers;
   };
   KernelApiIrBuilder(llvm::LLVMContext& context, Options options);

tensorflow/include/tensorflow/compiler/xla/backends/cpu/runtime/convolution_thunk_internal.h CHANGED Viewed

@@ -22,7 +22,7 @@ limitations under the License.
 #include <memory>
 #include <utility>
-#include "xla/backends/cpu/runtime/concurrency.h"
+#include "xla/backends/cpu/runtime/work_queue.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/framework/convolution/eigen_spatial_convolutions.h"  // IWYU pragma: keep
@@ -30,7 +30,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include "Eigen/Core"
-#include "Eigen/ThreadPool"
 #include "unsupported/Eigen/CXX11/Tensor"
 namespace xla::cpu::internal {
@@ -384,8 +383,9 @@ void EigenGenericConv2D(
     auto num_tasks = Eigen::numext::div_ceil(feature_group_count, task_size);
     if (use_thunk_runtime) {
-      ScheduleAll(
-          &device, num_tasks, [=, &device](Eigen::Index task_index) mutable {
+      Worker::Parallelize(
+          &device, /*num_workers=*/num_tasks, num_tasks,
+          [=, &device](Eigen::Index task_index) mutable {
             Eigen::Index start = task_index * task_size;
             Eigen::Index end = std::min(start + task_size, feature_group_count);
             for (Eigen::Index i = start; i < end; ++i) {
@@ -395,18 +395,16 @@ void EigenGenericConv2D(
             }
           });
     } else {
-      Eigen::Barrier barrier(num_tasks);
-      ScheduleAll(
-          &device, num_tasks, [=, &device, &barrier](Eigen::Index task_index) {
+      tsl::BlockUntilReady(Worker::Parallelize(
+          &device, /*num_workers=*/num_tasks, num_tasks,
+          [=, &device](Eigen::Index task_index) {
             Eigen::Index start = task_index * task_size;
             Eigen::Index end = std::min(start + task_size, feature_group_count);
             for (Eigen::Index i = start; i < end; ++i) {
               auto [output, convolved] = convolve_group(i);
               output.device(device) = convolved;
             }
-            barrier.Notify();
-          });
-      barrier.Wait();
+          }));
     }
   } else {

tensorflow/include/tensorflow/compiler/xla/backends/cpu/runtime/kernel_thunk.h CHANGED Viewed

@@ -63,6 +63,8 @@ class KernelThunkBase : public Thunk {
       const = 0;
   virtual absl::Span<const BufferAllocation::Slice> results_buffers() const = 0;
+  virtual const absl::flat_hash_set<int64_t>& invariant_arguments() const = 0;
 };
 namespace internal {
@@ -95,6 +97,10 @@ class KernelThunk : public KernelThunkBase {
     return absl::MakeSpan(results_buffers_);
   }
+  const absl::flat_hash_set<int64_t>& invariant_arguments() const final {
+    return invariant_arguments_;
+  }
  protected:
   tsl::AsyncValueRef<ExecuteEvent> ExecuteInternal(const ExecuteParams& params);
@@ -129,7 +135,7 @@ class KernelThunk : public KernelThunkBase {
   KernelThunk(Info info,
               absl::Span<const BufferAllocation::Slice> arguments_buffers,
               absl::Span<const BufferAllocation::Slice> results_buffers,
-              std::optional<absl::flat_hash_set<int64_t>> invariant_arguments,
+              absl::flat_hash_set<int64_t> invariant_arguments,
               std::string kernel_name, se::ThreadDim thread_dim,
               std::optional<uint64_t> min_alignment);
@@ -139,7 +145,7 @@ class KernelThunk : public KernelThunkBase {
   ResultsBuffers results_buffers_;
   // A set of invariant arguments (their indices).
-  std::optional<absl::flat_hash_set<int64_t>> invariant_arguments_;
+  absl::flat_hash_set<int64_t> invariant_arguments_;
   size_t num_kernel_args_;
@@ -189,7 +195,7 @@ class KernelThunk final : public internal::KernelThunk<> {
       absl::Span<const BufferAllocation::Slice> arguments_buffers,
       absl::Span<const BufferAllocation::Slice> results_buffers,
       std::string kernel_name, se::ThreadDim thread_dim,
-      std::optional<absl::flat_hash_set<int64_t>> invariant_arguments,
+      absl::flat_hash_set<int64_t> invariant_arguments,
       std::optional<uint64_t> min_alignment = std::nullopt);
   static absl::StatusOr<std::unique_ptr<Thunk>> Create(

tensorflow/include/tensorflow/compiler/xla/backends/cpu/runtime/work_queue.h CHANGED Viewed

@@ -29,7 +29,6 @@ limitations under the License.
 #include "absl/base/attributes.h"
 #include "absl/base/optimization.h"
 #include "absl/container/fixed_array.h"
-#include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
@@ -44,15 +43,6 @@ namespace xla::cpu {
 // A work queue that partitions `num_tasks` tasks into `num_partitions`
 // partitions processed by parallel workers.
 class WorkQueue {
-  // Align all atomic counters to a cache line boundary to avoid false
-  // sharing between multiple worker threads.
-  static constexpr size_t kAtomicAlignment =
-#if defined(__cpp_lib_hardware_interference_size)
-      std::hardware_destructive_interference_size;
-#else
-      64;
-#endif
  public:
   WorkQueue(size_t num_tasks, size_t num_partitions);
@@ -60,13 +50,23 @@ class WorkQueue {
   // if the partition is complete.
   std::optional<size_t> Pop(size_t partition_index);
-  size_t num_partitions() const { return partitions_.size(); }
+  // Return the partition [begin, end) task range.
+  std::pair<size_t, size_t> partition_range(size_t partition_index) const;
-  bool empty() const { return empty_.load(std::memory_order_relaxed); }
+  size_t num_partitions() const { return partitions_.size(); }
  private:
   friend class Worker;
+  // Align all atomic counters to a cache line boundary to avoid false
+  // sharing between multiple worker threads.
+  static constexpr size_t kAtomicAlignment =
+#if defined(__cpp_lib_hardware_interference_size)
+      std::hardware_destructive_interference_size;
+#else
+      64;
+#endif
   struct Partition {
     void Initialize(size_t begin, size_t end);
@@ -76,8 +76,21 @@ class WorkQueue {
     size_t end;
   };
+  // An empty work queue flag to stop worker threads from looping through all
+  // partitions looking for work.
+  bool IsEmpty() const { return empty_.load(std::memory_order_relaxed); }
+  void SetEmpty() { empty_.store(true, std::memory_order_relaxed); }
+  // Notify that one of the workers switched to the work stealing mode.
+  void NotifyWorkStealingWorker();
+  // Decrements the number of work stealing workers by at most `max_workers` and
+  // returns the number of decremented work stealing workers.
+  size_t DecrementWorkStealingWorkers(size_t max_workers);
   absl::FixedArray<Partition, 32> partitions_;
   alignas(kAtomicAlignment) std::atomic<bool> empty_;
+  alignas(kAtomicAlignment) std::atomic<size_t> num_work_stealing_workers_;
 };
 // Worker processes tasks from the work queue starting from the assigned
@@ -130,10 +143,14 @@ inline void WorkQueue::Partition::Initialize(size_t begin, size_t end) {
 }
 inline WorkQueue::WorkQueue(size_t num_tasks, size_t num_partitions)
-    : partitions_(num_partitions), empty_(num_tasks == 0) {
-  size_t partition_size = tsl::MathUtil::CeilOfRatio(num_tasks, num_partitions);
-  for (size_t i = 0, begin = 0, end = partition_size; i < num_partitions;
-       ++i, begin = end, end = std::min(num_tasks, end + partition_size)) {
+    : partitions_(num_partitions),
+      empty_(num_tasks == 0),
+      num_work_stealing_workers_(0) {
+  size_t partition_size =
+      tsl::MathUtil::FloorOfRatio(num_tasks, num_partitions);
+  size_t rem_tasks = num_tasks % num_partitions;
+  for (size_t i = 0, begin = 0, end = 0; i < num_partitions; ++i, begin = end) {
+    end = begin + partition_size + ((i < rem_tasks) ? 1 : 0);
     partitions_[i].Initialize(begin, end);
   }
 }
@@ -154,6 +171,29 @@ inline std::optional<size_t> WorkQueue::Pop(size_t partition_index) {
                                                     : std::make_optional(index);
 }
+inline std::pair<size_t, size_t> WorkQueue::partition_range(
+    size_t partition_index) const {
+  DCHECK(partition_index < partitions_.size()) << "Invalid partition index";
+  return {partitions_[partition_index].begin, partitions_[partition_index].end};
+}
+inline void WorkQueue::NotifyWorkStealingWorker() {
+  num_work_stealing_workers_.fetch_add(1, std::memory_order_relaxed);
+}
+inline size_t WorkQueue::DecrementWorkStealingWorkers(size_t max_workers) {
+  size_t n = num_work_stealing_workers_.load(std::memory_order_relaxed);
+  size_t decrement = std::min(n, max_workers);
+  while (decrement && !num_work_stealing_workers_.compare_exchange_weak(
+                          n, n - decrement, std::memory_order_relaxed,
+                          std::memory_order_relaxed)) {
+    decrement = std::min(n, max_workers);
+  }
+  return decrement;
+}
 inline Worker::Worker(size_t worker_index, WorkQueue* queue)
     : worker_index_(worker_index),
       partition_index_(worker_index),
@@ -163,7 +203,13 @@ inline std::optional<size_t> Worker::Pop() {
   std::optional<size_t> task = queue_->Pop(partition_index_);
   if (ABSL_PREDICT_TRUE(task)) return task;
-  while (!task.has_value() && !queue_->empty()) {
+  // If we didn't find a task in the initially assigned partition, notify the
+  // work queue that we are switching to work stealing mode.
+  if (ABSL_PREDICT_FALSE(partition_index_ == worker_index_)) {
+    queue_->NotifyWorkStealingWorker();
+  }
+  while (!task.has_value() && !queue_->IsEmpty()) {
     // Wrap around to the first partition.
     if (ABSL_PREDICT_FALSE(++partition_index_ >= queue_->num_partitions())) {
       partition_index_ = 0;
@@ -171,7 +217,7 @@ inline std::optional<size_t> Worker::Pop() {
     // We checked all partitions and got back to the partition we started from.
     if (ABSL_PREDICT_FALSE(partition_index_ == worker_index_)) {
-      queue_->empty_.store(true, std::memory_order_relaxed);
+      queue_->SetEmpty();
       break;
     }
@@ -205,6 +251,7 @@ Worker::ParallelizeContext<ParallelTask>::ParallelizeContext(
       parallel_task(std::forward<ParallelTask>(parallel_task)) {}
 template <typename ParallelTask>
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
 void Worker::ParallelizeWithContext(ParallelizeContext<ParallelTask>* ctx,
                                     uint16_t start_index, uint16_t end_index) {
   DCHECK_LT(start_index, end_index) << "Invalid worker index range";
@@ -223,11 +270,26 @@ void Worker::ParallelizeWithContext(ParallelizeContext<ParallelTask>* ctx,
   while (end_index - start_index > 1) {
     // If work queue is empty, we don't need to keep enqueuing more workers and
     // can simply count down for the remaining workers.
-    if (ABSL_PREDICT_FALSE(ctx->work_queue.empty())) {
+    if (ABSL_PREDICT_FALSE(ctx->work_queue.IsEmpty())) {
       count_down(end_index - start_index, absl::OkStatus());
       return;
     }
+    // If we have workers in the work stealing mode, we can skip enqueuing
+    // more tasks as existing workers will process remaining partitions. By
+    // doing this optimization we avoid unnecessary thread pool overheads.
+    size_t skip_workers =
+        ctx->work_queue.DecrementWorkStealingWorkers(end_index - start_index);
+    if (ABSL_PREDICT_FALSE(skip_workers > 0)) {
+      DCHECK_LE(skip_workers, end_index - start_index);
+      count_down(skip_workers, absl::OkStatus());
+      end_index -= skip_workers;
+      if (start_index == end_index) return;
+      if (end_index - start_index == 1) break;
+    }
+    DCHECK_GE(end_index - start_index, 1);
     uint16_t mid_index = (start_index + end_index) / 2;
     ctx->device->enqueueNoNotification([ctx, mid_index, end_index] {
       ParallelizeWithContext(ctx, mid_index, end_index);

tensorflow/include/tensorflow/compiler/xla/codegen/kernel_spec.h CHANGED Viewed

@@ -17,12 +17,14 @@ limitations under the License.
 #define XLA_CODEGEN_KERNEL_SPEC_H_
 #include <cstddef>
+#include <cstdint>
 #include <optional>
 #include <string>
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
-#include "xla/runtime/buffer_use.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/launch_dim.h"
 namespace xla {
@@ -33,15 +35,17 @@ namespace xla {
 // will load kernel PTX on device and instantiate a KernelThunk.
 class KernelSpec {
  public:
-  using BufferUses = absl::InlinedVector<BufferUse, 8>;
+  using Buffers = absl::InlinedVector<BufferAllocation::Slice, 8>;
   KernelSpec(absl::string_view name, se::ThreadDim thread_dim,
-             BufferUses buffer_uses,
+             Buffers argument_buffers, Buffers result_buffers,
+             absl::flat_hash_set<int64_t> invariant_arguments,
              std::optional<size_t> scratch_bytes = std::nullopt);
   KernelSpec(absl::string_view name, se::ClusterDim cluster_dim,
              se::BlockDim block_dim, se::ThreadDim thread_dim,
-             BufferUses buffer_uses,
+             Buffers argument_buffers, Buffers result_buffers,
+             absl::flat_hash_set<int64_t> invariant_arguments,
              std::optional<size_t> scratch_bytes = std::nullopt);
   // Get the backend specific name of the kernel.
@@ -67,15 +71,28 @@ class KernelSpec {
   // managed buffer that is likely to be in L1/L2 cache).
   std::optional<size_t> scratch_bytes() const { return scratch_bytes_; }
-  // Buffers (buffer allocation slices) used by the kernel.
-  const BufferUses& buffer_uses() const { return buffer_uses_; }
+  // Argument buffers read by the kernel.
+  const Buffers& argument_buffers() const { return argument_buffers_; }
+  // Result buffers written to by the kernel.
+  const Buffers& result_buffers() const { return result_buffers_; }
+  // Returns a set of invariant arguments (corresponding to the indices in the
+  // argument buffers list).
+  const absl::flat_hash_set<int64_t>& invariant_arguments() const {
+    return invariant_arguments_;
+  }
  private:
   std::string name_;
   se::ClusterDim cluster_dim_;
   se::BlockDim block_dim_;
   se::ThreadDim thread_dim_;
-  BufferUses buffer_uses_;
+  Buffers argument_buffers_;
+  Buffers result_buffers_;
+  absl::flat_hash_set<int64_t> invariant_arguments_;
   std::optional<size_t> scratch_bytes_;
 };