tf-nightly-cpu 2.20.0.dev20250220-cp310-cp310-win_amd64.whl → 2.20.0.dev20250221-cp310-cp310-win_amd64.whl

Files changed (113)
  1. tensorflow/_api/v2/compat/v1/summary/__init__.py +2 -2
  2. tensorflow/_api/v2/compat/v1/tpu/experimental/embedding/__init__.py +2 -2
  3. tensorflow/_api/v2/compat/v2/summary/__init__.py +10 -10
  4. tensorflow/_api/v2/compat/v2/summary/experimental/__init__.py +4 -4
  5. tensorflow/_api/v2/compat/v2/tpu/experimental/embedding/__init__.py +2 -2
  6. tensorflow/_api/v2/summary/__init__.py +10 -10
  7. tensorflow/_api/v2/summary/experimental/__init__.py +4 -4
  8. tensorflow/_api/v2/tpu/experimental/embedding/__init__.py +2 -2
  9. tensorflow/compiler/mlir/stablehlo/stablehlo_extension.pyd +0 -0
  10. tensorflow/compiler/tf2tensorrt/_pywrap_py_utils.pyd +0 -0
  11. tensorflow/compiler/tf2xla/ops/_xla_ops.so +0 -0
  12. tensorflow/include/external/llvm-project/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h +12 -0
  13. tensorflow/include/external/stablehlo/_virtual_includes/stablehlo_pass_utils/stablehlo/transforms/PassUtils.h +7 -0
  14. tensorflow/include/external/stablehlo/_virtual_includes/stablehlo_passes/stablehlo/transforms/PassUtils.h +7 -0
  15. tensorflow/include/external/stablehlo/stablehlo/transforms/PassUtils.h +7 -0
  16. tensorflow/include/tensorflow/compiler/xla/backends/cpu/codegen/kernel_api_ir_builder.h +3 -2
  17. tensorflow/include/tensorflow/compiler/xla/backends/cpu/runtime/kernel_thunk.h +9 -3
  18. tensorflow/include/tensorflow/compiler/xla/backends/cpu/runtime/work_queue.h +81 -18
  19. tensorflow/include/tensorflow/compiler/xla/codegen/kernel_spec.h +24 -7
  20. tensorflow/include/tensorflow/compiler/xla/hlo/ir/hlo_casting_utils.h +0 -44
  21. tensorflow/include/tensorflow/compiler/xla/mlir_hlo/_virtual_includes/stablehlo_extension_pass_inc_gen/stablehlo_ext/transforms/passes.h.inc +149 -4
  22. tensorflow/include/tensorflow/compiler/xla/mlir_hlo/stablehlo_ext/transforms/passes.h.inc +149 -4
  23. tensorflow/include/tensorflow/compiler/xla/pjrt/distributed/client.h +5 -0
  24. tensorflow/include/tensorflow/compiler/xla/pjrt/gpu/se_gpu_pjrt_client.h +1 -92
  25. tensorflow/include/tensorflow/compiler/xla/pjrt/gpu/se_gpu_topology_description.h +126 -0
  26. tensorflow/include/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h +1 -49
  27. tensorflow/include/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_device_description.h +75 -0
  28. tensorflow/include/tensorflow/compiler/xla/pjrt/plugin/xla_cpu/cpu_execute_options.h +57 -0
  29. tensorflow/include/tensorflow/compiler/xla/pjrt/plugin/xla_cpu/cpu_topology.h +4 -0
  30. tensorflow/include/tensorflow/compiler/xla/service/constant_value.h +1 -0
  31. tensorflow/include/tensorflow/compiler/xla/service/hlo_module_util.h +52 -1
  32. tensorflow/include/tensorflow/compiler/xla/service/hlo_proto_util.h +0 -12
  33. tensorflow/include/tensorflow/compiler/xla/tsl/framework/convolution/eigen_spatial_convolutions-inl.h +5 -5
  34. tensorflow/include/tensorflow/core/kernels/eigen_attention.h +4 -4
  35. tensorflow/include/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h +6 -6
  36. tensorflow/include/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h +10 -8
  37. tensorflow/include/tensorflow/core/kernels/eigen_cuboid_convolution.h +6 -6
  38. tensorflow/include/tensorflow/core/kernels/eigen_pooling.h +12 -12
  39. tensorflow/include/tensorflow/core/public/release_version.h +39 -0
  40. tensorflow/include/tensorflow/core/public/version.h +112 -127
  41. tensorflow/include/tensorflow/python/eager/pywrap_tfe.h +1 -1
  42. tensorflow/include/xla/backends/cpu/codegen/kernel_api_ir_builder.h +3 -2
  43. tensorflow/include/xla/backends/cpu/runtime/kernel_thunk.h +9 -3
  44. tensorflow/include/xla/backends/cpu/runtime/work_queue.h +81 -18
  45. tensorflow/include/xla/codegen/kernel_spec.h +24 -7
  46. tensorflow/include/xla/hlo/ir/hlo_casting_utils.h +0 -44
  47. tensorflow/include/xla/mlir_hlo/_virtual_includes/stablehlo_extension_pass_inc_gen/stablehlo_ext/transforms/passes.h.inc +149 -4
  48. tensorflow/include/xla/mlir_hlo/stablehlo_ext/transforms/passes.h.inc +149 -4
  49. tensorflow/include/xla/pjrt/distributed/client.h +5 -0
  50. tensorflow/include/xla/pjrt/gpu/se_gpu_pjrt_client.h +1 -92
  51. tensorflow/include/xla/pjrt/gpu/se_gpu_topology_description.h +126 -0
  52. tensorflow/include/xla/pjrt/pjrt_stream_executor_client.h +1 -49
  53. tensorflow/include/xla/pjrt/pjrt_stream_executor_device_description.h +75 -0
  54. tensorflow/include/xla/pjrt/plugin/xla_cpu/cpu_execute_options.h +57 -0
  55. tensorflow/include/xla/pjrt/plugin/xla_cpu/cpu_topology.h +4 -0
  56. tensorflow/include/xla/service/constant_value.h +1 -0
  57. tensorflow/include/xla/service/hlo_module_util.h +52 -1
  58. tensorflow/include/xla/service/hlo_proto_util.h +0 -12
  59. tensorflow/include/xla/tsl/framework/convolution/eigen_spatial_convolutions-inl.h +5 -5
  60. tensorflow/lite/experimental/microfrontend/python/ops/_audio_microfrontend_op.so +0 -0
  61. tensorflow/lite/python/analyzer_wrapper/_pywrap_analyzer_wrapper.pyd +0 -0
  62. tensorflow/lite/python/interpreter_wrapper/_pywrap_tensorflow_interpreter_wrapper.pyd +0 -0
  63. tensorflow/lite/python/optimize/_pywrap_tensorflow_lite_calibration_wrapper.pyd +0 -0
  64. tensorflow/python/_pywrap_dtensor_device.pyd +0 -0
  65. tensorflow/python/_pywrap_mlir.pyd +0 -0
  66. tensorflow/python/_pywrap_parallel_device.pyd +0 -0
  67. tensorflow/python/_pywrap_quantize_training.pyd +0 -0
  68. tensorflow/python/_pywrap_tensorflow_internal.pyd +0 -0
  69. tensorflow/python/_pywrap_tfcompile.pyd +0 -0
  70. tensorflow/python/_pywrap_tfe.pyd +0 -0
  71. tensorflow/python/client/_pywrap_debug_events_writer.pyd +0 -0
  72. tensorflow/python/client/_pywrap_device_lib.pyd +0 -0
  73. tensorflow/python/client/_pywrap_events_writer.pyd +0 -0
  74. tensorflow/python/client/_pywrap_tf_session.pyd +0 -0
  75. tensorflow/python/compat/compat.py +1 -1
  76. tensorflow/python/data/experimental/service/_pywrap_server_lib.pyd +0 -0
  77. tensorflow/python/eager/imperative_grad.py +5 -5
  78. tensorflow/python/eager/polymorphic_function/atomic_function.py +1 -1
  79. tensorflow/python/eager/polymorphic_function/compiler_ir.py +1 -1
  80. tensorflow/python/eager/polymorphic_function/polymorphic_function.py +45 -41
  81. tensorflow/python/eager/tape.py +2 -2
  82. tensorflow/python/framework/_dtypes.pyd +0 -0
  83. tensorflow/python/framework/_op_def_library_pybind.pyd +0 -0
  84. tensorflow/python/framework/_op_def_registry.pyd +0 -0
  85. tensorflow/python/framework/_proto_comparators.pyd +0 -0
  86. tensorflow/python/framework/_pywrap_python_op_gen.pyd +0 -0
  87. tensorflow/python/framework/_test_metrics_util.pyd +0 -0
  88. tensorflow/python/grappler/_pywrap_tf_cluster.pyd +0 -0
  89. tensorflow/python/grappler/_pywrap_tf_item.pyd +0 -0
  90. tensorflow/python/grappler/_pywrap_tf_optimizer.pyd +0 -0
  91. tensorflow/python/lib/core/_pywrap_py_func.pyd +0 -0
  92. tensorflow/python/lib/io/_pywrap_file_io.pyd +0 -0
  93. tensorflow/python/lib/io/_pywrap_record_io.pyd +0 -0
  94. tensorflow/python/ops/summary_ops_v2.py +5 -1
  95. tensorflow/python/profiler/internal/_pywrap_profiler.pyd +0 -0
  96. tensorflow/python/profiler/internal/_pywrap_profiler_plugin.pyd +0 -0
  97. tensorflow/python/saved_model/pywrap_saved_model.pyd +0 -0
  98. tensorflow/python/tpu/_pywrap_sparse_core_layout.pyd +0 -0
  99. tensorflow/python/tpu/_pywrap_tpu_embedding.pyd +0 -0
  100. tensorflow/python/tpu/tpu_embedding_v3.py +14 -7
  101. tensorflow/python/util/_pywrap_checkpoint_reader.pyd +0 -0
  102. tensorflow/python/util/_pywrap_kernel_registry.pyd +0 -0
  103. tensorflow/python/util/_pywrap_stat_summarizer.pyd +0 -0
  104. tensorflow/python/util/_pywrap_tfprof.pyd +0 -0
  105. tensorflow/python/util/_pywrap_transform_graph.pyd +0 -0
  106. tensorflow/python/util/_pywrap_utils.pyd +0 -0
  107. tensorflow/python/util/_tf_stack.pyd +0 -0
  108. tensorflow/tools/pip_package/setup.py +2 -2
  109. {tf_nightly_cpu-2.20.0.dev20250220.dist-info → tf_nightly_cpu-2.20.0.dev20250221.dist-info}/METADATA +1 -1
  110. {tf_nightly_cpu-2.20.0.dev20250220.dist-info → tf_nightly_cpu-2.20.0.dev20250221.dist-info}/RECORD +113 -106
  111. {tf_nightly_cpu-2.20.0.dev20250220.dist-info → tf_nightly_cpu-2.20.0.dev20250221.dist-info}/WHEEL +0 -0
  112. {tf_nightly_cpu-2.20.0.dev20250220.dist-info → tf_nightly_cpu-2.20.0.dev20250221.dist-info}/entry_points.txt +0 -0
  113. {tf_nightly_cpu-2.20.0.dev20250220.dist-info → tf_nightly_cpu-2.20.0.dev20250221.dist-info}/top_level.txt +0 -0
tensorflow/include/tensorflow/core/public/version.h

@@ -1,127 +1,112 @@
- /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
-
- #ifndef TENSORFLOW_CORE_PUBLIC_VERSION_H_
- #define TENSORFLOW_CORE_PUBLIC_VERSION_H_
-
- // TensorFlow uses semantic versioning, see http://semver.org/.
-
- // Also update tensorflow/tensorflow.bzl and
- // tensorflow/tools/pip_package/setup.py
- #define TF_MAJOR_VERSION 2
- #define TF_MINOR_VERSION 20
- #define TF_PATCH_VERSION 0
-
- // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
- // "-beta", "-rc", "-rc.1")
- #define TF_VERSION_SUFFIX "-dev20250220"
-
- #define TF_STR_HELPER(x) #x
- #define TF_STR(x) TF_STR_HELPER(x)
-
- // e.g. "0.5.0" or "0.6.0-alpha".
- #define TF_VERSION_STRING \
- (TF_STR(TF_MAJOR_VERSION) "." TF_STR(TF_MINOR_VERSION) "." TF_STR( \
- TF_PATCH_VERSION) TF_VERSION_SUFFIX)
-
- // GraphDef compatibility versions (the versions field in graph.proto).
- //
- // Each graph has producer and min_consumer versions, and each
- // consumer has its own version and a min_producer. In addition, graphs can
- // mark specific consumer versions as bad (to prevent bugs from executing).
- // A consumer will execute a graph if the consumer's version is at least the
- // graph's min_consumer, the graph's producer version is at least the consumer's
- // min_producer, and the consumer version isn't specifically disallowed by the
- // graph.
- //
- // By default, newly created graphs have producer version TF_GRAPH_DEF_VERSION
- // min_consumer TF_GRAPH_DEF_MIN_CONSUMER, and no other bad consumer versions.
- //
- // Version history:
- //
- // 0. Graphs created before GraphDef versioning
- // 1. First real version (2dec2015)
- // 2. adjust_contrast only takes float, doesn't perform clamping (11dec2015)
- // 3. Remove TileGrad, since it was equivalent to reduce_sum (30dec2015)
- // 4. When support for this version is removed, we can safely make AttrValue
- // parsing more strict with respect to empty list values (see
- // 111635679, 7jan2016).
- // 5. Graphs are wholly-validated during Session::Create() (7jan2016).
- // 6. TensorFlow is scalar strict within Google (27jan2016).
- // 7. Remove TopK in favor of TopKV2 (5feb2016).
- // 8. Replace RandomCrop from C++ with pure Python (5feb2016).
- // 9. Deprecate batch_norm_with_global_normalization (16feb2016).
- // 10. Deprecate conv3d_backprop_{filter,input} (10jun2016).
- // 11. Deprecate {batch}_self_adjoint_eig (3aug2016).
- // 12. Graph consumers understand the node_def field of FunctionDef (22aug2016).
- // 13. Deprecate multiple batch linear algebra ops (9sep2016).
- // 14. Deprecate batch_matrix_* ops. (10sep2016).
- // 15. Deprecate batch_fft_* ops. (14sep2016).
- // 16. Deprecate tensor_array (v1) ops in favor of v2 (10nov2016).
- // 17. Deprecate inv (11nov2016).
- // 17. Expose reverse_v2 (10nov2016)
- // 18. Add VariableV2 (30nov2016)
- // 19. Deprecated ops created by models moved out of core SkipGram, NegTrain.
- // (08dec2016)
- // 20. Catch all version 1.0 changes to Python API generation. SplitV is now
- // used for tf.split, ReverseV2 is now used by tf.reverse, ConcatV2 is
- // now used by tf.concat. Graphs use flooring
- // division and mod semantics. TensorArrayV3. (12dec2016)
- // Also considered the version for when it is required for reduction
- // ops' indices to be scalar or vector, and not higher rank.
- // Some earlier graph def versions allowed this.
- // 21. Dropped FunctionDef.Node support, switched to node_def introduced
- // in version 12. (11jan2017)
- // 22. Placeholder now can specify and enforce scalar and partial
- // shapes, particularly when restoring a graph from GraphDef
- // produced at version 22 or later. (04/10/2016)
- // 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2.
- // 24. Deprecate lookup ops (v1) ops in favor of v2 (30may2017)
- // 25. Deprecate stack (v1) ops in favor of v2 (2017/6/15).
- // 25. Deprecate RandomPoisson (v1) ops in favor of v2 (2017/10/25).
- // 26. Add a bool 'stripped_default_attrs' to MetaInfoDef indicating
- // whether default-valued attrs have been stripped from the nodes in the
- // GraphDef. (7dec2017)
- // 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops
- // deprecated in favor of V2 ops. (2018/01/23)
- // 28. Deprecate MatrixExponential op in favor of Python implementation.
- // (2018/08/21).
- // (2019/02/15). Added `control_ret` field to FunctionDef proto, and
- // `control_output` field to OpDef proto.
- // 29. Deprecate StatefulStandardNormal op in favor of StatefulStandardNormalV2.
- // (2019/03/25).
- // (2019/04/17). Added `arg_attr` field to FunctionDefProto.
- // 30. (2019/05/09) First date based GraphDef version. GraphDef
- // versions advance by 1 each day after this point.
-
- #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
- #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
- #define TF_GRAPH_DEF_VERSION 2143 // Updated: 2025/2/19
-
- // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
- //
- // The checkpoint versions have the same semantics as GraphDef versions, but the
- // numbering scheme is separate. We have no plans to ever deprecate checkpoint
- // versions, but it's good to have this in place in case we ever need to.
- //
- // Version history:
- //
- // 0. Checkpoints saved before checkpoint versioning.
- // 1. First real version (10feb2015).
- #define TF_CHECKPOINT_VERSION_MIN_PRODUCER 0
- #define TF_CHECKPOINT_VERSION_MIN_CONSUMER 0
- #define TF_CHECKPOINT_VERSION 1
-
- #endif // TENSORFLOW_CORE_PUBLIC_VERSION_H_
+ /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ #ifndef TENSORFLOW_CORE_PUBLIC_VERSION_H_
+ #define TENSORFLOW_CORE_PUBLIC_VERSION_H_
+
+ // TensorFlow uses semantic versioning, see http://semver.org/.
+
+ #define TF_STR_HELPER(x) #x
+ #define TF_STR(x) TF_STR_HELPER(x)
+
+ // GraphDef compatibility versions (the versions field in graph.proto).
+ //
+ // Each graph has producer and min_consumer versions, and each
+ // consumer has its own version and a min_producer. In addition, graphs can
+ // mark specific consumer versions as bad (to prevent bugs from executing).
+ // A consumer will execute a graph if the consumer's version is at least the
+ // graph's min_consumer, the graph's producer version is at least the consumer's
+ // min_producer, and the consumer version isn't specifically disallowed by the
+ // graph.
+ //
+ // By default, newly created graphs have producer version TF_GRAPH_DEF_VERSION
+ // min_consumer TF_GRAPH_DEF_MIN_CONSUMER, and no other bad consumer versions.
+ //
+ // Version history:
+ //
+ // 0. Graphs created before GraphDef versioning
+ // 1. First real version (2dec2015)
+ // 2. adjust_contrast only takes float, doesn't perform clamping (11dec2015)
+ // 3. Remove TileGrad, since it was equivalent to reduce_sum (30dec2015)
+ // 4. When support for this version is removed, we can safely make AttrValue
+ // parsing more strict with respect to empty list values (see
+ // 111635679, 7jan2016).
+ // 5. Graphs are wholly-validated during Session::Create() (7jan2016).
+ // 6. TensorFlow is scalar strict within Google (27jan2016).
+ // 7. Remove TopK in favor of TopKV2 (5feb2016).
+ // 8. Replace RandomCrop from C++ with pure Python (5feb2016).
+ // 9. Deprecate batch_norm_with_global_normalization (16feb2016).
+ // 10. Deprecate conv3d_backprop_{filter,input} (10jun2016).
+ // 11. Deprecate {batch}_self_adjoint_eig (3aug2016).
+ // 12. Graph consumers understand the node_def field of FunctionDef (22aug2016).
+ // 13. Deprecate multiple batch linear algebra ops (9sep2016).
+ // 14. Deprecate batch_matrix_* ops. (10sep2016).
+ // 15. Deprecate batch_fft_* ops. (14sep2016).
+ // 16. Deprecate tensor_array (v1) ops in favor of v2 (10nov2016).
+ // 17. Deprecate inv (11nov2016).
+ // 17. Expose reverse_v2 (10nov2016)
+ // 18. Add VariableV2 (30nov2016)
+ // 19. Deprecated ops created by models moved out of core SkipGram, NegTrain.
+ // (08dec2016)
+ // 20. Catch all version 1.0 changes to Python API generation. SplitV is now
+ // used for tf.split, ReverseV2 is now used by tf.reverse, ConcatV2 is
+ // now used by tf.concat. Graphs use flooring
+ // division and mod semantics. TensorArrayV3. (12dec2016)
+ // Also considered the version for when it is required for reduction
+ // ops' indices to be scalar or vector, and not higher rank.
+ // Some earlier graph def versions allowed this.
+ // 21. Dropped FunctionDef.Node support, switched to node_def introduced
+ // in version 12. (11jan2017)
+ // 22. Placeholder now can specify and enforce scalar and partial
+ // shapes, particularly when restoring a graph from GraphDef
+ // produced at version 22 or later. (04/10/2016)
+ // 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2.
+ // 24. Deprecate lookup ops (v1) ops in favor of v2 (30may2017)
+ // 25. Deprecate stack (v1) ops in favor of v2 (2017/6/15).
+ // 25. Deprecate RandomPoisson (v1) ops in favor of v2 (2017/10/25).
+ // 26. Add a bool 'stripped_default_attrs' to MetaInfoDef indicating
+ // whether default-valued attrs have been stripped from the nodes in the
+ // GraphDef. (7dec2017)
+ // 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops
+ // deprecated in favor of V2 ops. (2018/01/23)
+ // 28. Deprecate MatrixExponential op in favor of Python implementation.
+ // (2018/08/21).
+ // (2019/02/15). Added `control_ret` field to FunctionDef proto, and
+ // `control_output` field to OpDef proto.
+ // 29. Deprecate StatefulStandardNormal op in favor of StatefulStandardNormalV2.
+ // (2019/03/25).
+ // (2019/04/17). Added `arg_attr` field to FunctionDefProto.
+ // 30. (2019/05/09) First date based GraphDef version. GraphDef
+ // versions advance by 1 each day after this point.
+
+ #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
+ #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
+ #define TF_GRAPH_DEF_VERSION 2144 // Updated: 2025/2/20
+
+ // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
+ //
+ // The checkpoint versions have the same semantics as GraphDef versions, but the
+ // numbering scheme is separate. We have no plans to ever deprecate checkpoint
+ // versions, but it's good to have this in place in case we ever need to.
+ //
+ // Version history:
+ //
+ // 0. Checkpoints saved before checkpoint versioning.
+ // 1. First real version (10feb2015).
+ #define TF_CHECKPOINT_VERSION_MIN_PRODUCER 0
+ #define TF_CHECKPOINT_VERSION_MIN_CONSUMER 0
+ #define TF_CHECKPOINT_VERSION 1
+
+ #endif // TENSORFLOW_CORE_PUBLIC_VERSION_H_
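
The GraphDef comment block above fully specifies the compatibility rule, so it can be illustrated with a small standalone sketch. This is not TensorFlow's actual implementation; GraphVersions and ConsumerCanExecute are illustrative names that mirror the producer, min_consumer, and bad_consumers fields of the versions message in graph.proto.

    // Sketch of the rule: a consumer may execute a graph only if
    //   consumer_version >= graph.min_consumer,
    //   graph.producer   >= consumer_min_producer, and
    //   consumer_version is not listed in graph.bad_consumers.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct GraphVersions {  // illustrative stand-in for the proto fields
      int32_t producer = 0;
      int32_t min_consumer = 0;
      std::vector<int32_t> bad_consumers;
    };

    bool ConsumerCanExecute(const GraphVersions& graph, int32_t consumer_version,
                            int32_t consumer_min_producer) {
      if (consumer_version < graph.min_consumer) return false;
      if (graph.producer < consumer_min_producer) return false;
      return std::find(graph.bad_consumers.begin(), graph.bad_consumers.end(),
                       consumer_version) == graph.bad_consumers.end();
    }

With the values in this release, a freshly created graph carries producer 2144 and min_consumer 0, so it is accepted by any consumer whose min_producer does not exceed 2144.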

tensorflow/include/tensorflow/python/eager/pywrap_tfe.h

@@ -443,7 +443,7 @@ EagerContextThreadLocalData* GetEagerContextThreadLocalData(
  // wish to destroy thread-local state associated with a single py_eager_context
  // for multiple threads, then you must call this method from each thread.
  //
- // Thread-local state assocaited with eager contexts is also automatically
+ // Thread-local state associated with eager contexts is also automatically
  // cleaned up when the thread is destroyed.
  //
  // This function assumes that the Python GIL is held (and does not perform its

xla/backends/cpu/codegen/kernel_api_ir_builder.h

@@ -89,9 +89,10 @@ class KernelApiIrBuilder {
  // read-only if it is not aliased with any result.
  absl::flat_hash_set<int64_t> invariant_arguments;

- // the set of buffer uses for this kernel, can be empty if buffer
+ // The set of buffers used by this kernel, can be empty if buffer assignment
  // was not provided.
- absl::InlinedVector<BufferUse, 8> buffer_uses;
+ absl::InlinedVector<BufferAllocation::Slice, 8> argument_buffers;
+ absl::InlinedVector<BufferAllocation::Slice, 8> result_buffers;
  };

  KernelApiIrBuilder(llvm::LLVMContext& context, Options options);

xla/backends/cpu/runtime/kernel_thunk.h

@@ -63,6 +63,8 @@ class KernelThunkBase : public Thunk {
  const = 0;

  virtual absl::Span<const BufferAllocation::Slice> results_buffers() const = 0;
+
+ virtual const absl::flat_hash_set<int64_t>& invariant_arguments() const = 0;
  };

  namespace internal {
@@ -95,6 +97,10 @@ class KernelThunk : public KernelThunkBase {
  return absl::MakeSpan(results_buffers_);
  }

+ const absl::flat_hash_set<int64_t>& invariant_arguments() const final {
+ return invariant_arguments_;
+ }
+
  protected:
  tsl::AsyncValueRef<ExecuteEvent> ExecuteInternal(const ExecuteParams& params);

@@ -129,7 +135,7 @@ class KernelThunk : public KernelThunkBase {
  KernelThunk(Info info,
  absl::Span<const BufferAllocation::Slice> arguments_buffers,
  absl::Span<const BufferAllocation::Slice> results_buffers,
- std::optional<absl::flat_hash_set<int64_t>> invariant_arguments,
+ absl::flat_hash_set<int64_t> invariant_arguments,
  std::string kernel_name, se::ThreadDim thread_dim,
  std::optional<uint64_t> min_alignment);

@@ -139,7 +145,7 @@ class KernelThunk : public KernelThunkBase {
  ResultsBuffers results_buffers_;

  // A set of invariant arguments (their indices).
- std::optional<absl::flat_hash_set<int64_t>> invariant_arguments_;
+ absl::flat_hash_set<int64_t> invariant_arguments_;

  size_t num_kernel_args_;

@@ -189,7 +195,7 @@ class KernelThunk final : public internal::KernelThunk<> {
  absl::Span<const BufferAllocation::Slice> arguments_buffers,
  absl::Span<const BufferAllocation::Slice> results_buffers,
  std::string kernel_name, se::ThreadDim thread_dim,
- std::optional<absl::flat_hash_set<int64_t>> invariant_arguments,
+ absl::flat_hash_set<int64_t> invariant_arguments,
  std::optional<uint64_t> min_alignment = std::nullopt);

  static absl::StatusOr<std::unique_ptr<Thunk>> Create(

xla/backends/cpu/runtime/work_queue.h

@@ -44,15 +44,6 @@ namespace xla::cpu {
  // A work queue that partitions `num_tasks` tasks into `num_partitions`
  // partitions processed by parallel workers.
  class WorkQueue {
- // Align all atomic counters to a cache line boundary to avoid false
- // sharing between multiple worker threads.
- static constexpr size_t kAtomicAlignment =
- #if defined(__cpp_lib_hardware_interference_size)
- std::hardware_destructive_interference_size;
- #else
- 64;
- #endif
-
  public:
  WorkQueue(size_t num_tasks, size_t num_partitions);

@@ -60,13 +51,23 @@ class WorkQueue {
  // if the partition is complete.
  std::optional<size_t> Pop(size_t partition_index);

- size_t num_partitions() const { return partitions_.size(); }
+ // Return the partition [begin, end) task range.
+ std::pair<size_t, size_t> partition_range(size_t partition_index) const;

- bool empty() const { return empty_.load(std::memory_order_relaxed); }
+ size_t num_partitions() const { return partitions_.size(); }

  private:
  friend class Worker;

+ // Align all atomic counters to a cache line boundary to avoid false
+ // sharing between multiple worker threads.
+ static constexpr size_t kAtomicAlignment =
+ #if defined(__cpp_lib_hardware_interference_size)
+ std::hardware_destructive_interference_size;
+ #else
+ 64;
+ #endif
+
  struct Partition {
  void Initialize(size_t begin, size_t end);

@@ -76,8 +77,21 @@ class WorkQueue {
  size_t end;
  };

+ // An empty work queue flag to stop worker threads from looping through all
+ // partitions looking for work.
+ bool IsEmpty() const { return empty_.load(std::memory_order_relaxed); }
+ void SetEmpty() { empty_.store(true, std::memory_order_relaxed); }
+
+ // Notify that one of the workers switched to the work stealing mode.
+ void NotifyWorkStealingWorker();
+
+ // Decrements the number of work stealing workers by at most `max_workers` and
+ // returns the number of decremented work stealing workers.
+ size_t DecrementWorkStealingWorkers(size_t max_workers);
+
  absl::FixedArray<Partition, 32> partitions_;
  alignas(kAtomicAlignment) std::atomic<bool> empty_;
+ alignas(kAtomicAlignment) std::atomic<size_t> num_work_stealing_workers_;
  };

  // Worker processes tasks from the work queue starting from the assigned
@@ -130,10 +144,14 @@ inline void WorkQueue::Partition::Initialize(size_t begin, size_t end) {
  }

  inline WorkQueue::WorkQueue(size_t num_tasks, size_t num_partitions)
- : partitions_(num_partitions), empty_(num_tasks == 0) {
- size_t partition_size = tsl::MathUtil::CeilOfRatio(num_tasks, num_partitions);
- for (size_t i = 0, begin = 0, end = partition_size; i < num_partitions;
- ++i, begin = end, end = std::min(num_tasks, end + partition_size)) {
+ : partitions_(num_partitions),
+ empty_(num_tasks == 0),
+ num_work_stealing_workers_(0) {
+ size_t partition_size =
+ tsl::MathUtil::FloorOfRatio(num_tasks, num_partitions);
+ size_t rem_tasks = num_tasks % num_partitions;
+ for (size_t i = 0, begin = 0, end = 0; i < num_partitions; ++i, begin = end) {
+ end = begin + partition_size + ((i < rem_tasks) ? 1 : 0);
  partitions_[i].Initialize(begin, end);
  }
  }
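
The new constructor above switches from a CeilOfRatio split to a floor-plus-remainder split. A minimal standalone sketch of that partitioning (SplitTasks is an illustrative name, not part of XLA):

    #include <cstddef>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Splits num_tasks into num_partitions contiguous [begin, end) ranges whose
    // sizes differ by at most one, mirroring the WorkQueue constructor above.
    std::vector<std::pair<size_t, size_t>> SplitTasks(size_t num_tasks,
                                                      size_t num_partitions) {
      std::vector<std::pair<size_t, size_t>> ranges(num_partitions);
      size_t partition_size = num_tasks / num_partitions;  // FloorOfRatio
      size_t rem_tasks = num_tasks % num_partitions;
      for (size_t i = 0, begin = 0, end = 0; i < num_partitions; ++i, begin = end) {
        end = begin + partition_size + ((i < rem_tasks) ? 1 : 0);
        ranges[i] = {begin, end};
      }
      return ranges;
    }

    int main() {
      // 10 tasks over 4 partitions -> [0, 3) [3, 6) [6, 8) [8, 10).
      for (auto [b, e] : SplitTasks(10, 4)) std::printf("[%zu, %zu)\n", b, e);
    }

The previous CeilOfRatio scheme could leave trailing partitions empty (9 tasks over 4 partitions gave sizes 3, 3, 3, 0); the floor-plus-remainder split yields 3, 2, 2, 2 instead.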

@@ -154,6 +172,29 @@ inline std::optional<size_t> WorkQueue::Pop(size_t partition_index) {
  : std::make_optional(index);
  }

+ inline std::pair<size_t, size_t> WorkQueue::partition_range(
+ size_t partition_index) const {
+ DCHECK(partition_index < partitions_.size()) << "Invalid partition index";
+ return {partitions_[partition_index].begin, partitions_[partition_index].end};
+ }
+
+ inline void WorkQueue::NotifyWorkStealingWorker() {
+ num_work_stealing_workers_.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ inline size_t WorkQueue::DecrementWorkStealingWorkers(size_t max_workers) {
+ size_t n = num_work_stealing_workers_.load(std::memory_order_relaxed);
+
+ size_t decrement = std::min(n, max_workers);
+ while (decrement && !num_work_stealing_workers_.compare_exchange_weak(
+ n, n - decrement, std::memory_order_relaxed,
+ std::memory_order_relaxed)) {
+ decrement = std::min(n, max_workers);
+ }
+
+ return decrement;
+ }
+
  inline Worker::Worker(size_t worker_index, WorkQueue* queue)
  : worker_index_(worker_index),
  partition_index_(worker_index),
@@ -163,7 +204,13 @@ inline std::optional<size_t> Worker::Pop() {
  std::optional<size_t> task = queue_->Pop(partition_index_);
  if (ABSL_PREDICT_TRUE(task)) return task;

- while (!task.has_value() && !queue_->empty()) {
+ // If we didn't find a task in the initially assigned partition, notify the
+ // work queue that we are switching to work stealing mode.
+ if (ABSL_PREDICT_FALSE(partition_index_ == worker_index_)) {
+ queue_->NotifyWorkStealingWorker();
+ }
+
+ while (!task.has_value() && !queue_->IsEmpty()) {
  // Wrap around to the first partition.
  if (ABSL_PREDICT_FALSE(++partition_index_ >= queue_->num_partitions())) {
  partition_index_ = 0;
@@ -171,7 +218,7 @@ inline std::optional<size_t> Worker::Pop() {

  // We checked all partitions and got back to the partition we started from.
  if (ABSL_PREDICT_FALSE(partition_index_ == worker_index_)) {
- queue_->empty_.store(true, std::memory_order_relaxed);
+ queue_->SetEmpty();
  break;
  }

@@ -205,6 +252,7 @@ Worker::ParallelizeContext<ParallelTask>::ParallelizeContext(
  parallel_task(std::forward<ParallelTask>(parallel_task)) {}

  template <typename ParallelTask>
+ // NOLINTNEXTLINE(readability-function-cognitive-complexity)
  void Worker::ParallelizeWithContext(ParallelizeContext<ParallelTask>* ctx,
  uint16_t start_index, uint16_t end_index) {
  DCHECK_LT(start_index, end_index) << "Invalid worker index range";
@@ -223,11 +271,26 @@ void Worker::ParallelizeWithContext(ParallelizeContext<ParallelTask>* ctx,
  while (end_index - start_index > 1) {
  // If work queue is empty, we don't need to keep enqueuing more workers and
  // can simply count down for the remaining workers.
- if (ABSL_PREDICT_FALSE(ctx->work_queue.empty())) {
+ if (ABSL_PREDICT_FALSE(ctx->work_queue.IsEmpty())) {
  count_down(end_index - start_index, absl::OkStatus());
  return;
  }

+ // If we have workers in the work stealing mode, we can skip enqueuing
+ // more tasks as existing workers will process remaining partitions. By
+ // doing this optimization we avoid unnecessary thread pool overheads.
+ size_t skip_workers =
+ ctx->work_queue.DecrementWorkStealingWorkers(end_index - start_index);
+ if (ABSL_PREDICT_FALSE(skip_workers > 0)) {
+ DCHECK_LE(skip_workers, end_index - start_index);
+ count_down(skip_workers, absl::OkStatus());
+
+ end_index -= skip_workers;
+ if (start_index == end_index) return;
+ if (end_index - start_index == 1) break;
+ }
+
+ DCHECK_GE(end_index - start_index, 1);
  uint16_t mid_index = (start_index + end_index) / 2;
  ctx->device->enqueueNoNotification([ctx, mid_index, end_index] {
  ParallelizeWithContext(ctx, mid_index, end_index);

xla/codegen/kernel_spec.h

@@ -17,12 +17,14 @@ limitations under the License.
  #define XLA_CODEGEN_KERNEL_SPEC_H_

  #include <cstddef>
+ #include <cstdint>
  #include <optional>
  #include <string>

+ #include "absl/container/flat_hash_set.h"
  #include "absl/container/inlined_vector.h"
  #include "absl/strings/string_view.h"
- #include "xla/runtime/buffer_use.h"
+ #include "xla/service/buffer_assignment.h"
  #include "xla/stream_executor/launch_dim.h"

  namespace xla {
@@ -33,15 +35,17 @@ namespace xla {
  // will load kernel PTX on device and instantiate a KernelThunk.
  class KernelSpec {
  public:
- using BufferUses = absl::InlinedVector<BufferUse, 8>;
+ using Buffers = absl::InlinedVector<BufferAllocation::Slice, 8>;

  KernelSpec(absl::string_view name, se::ThreadDim thread_dim,
- BufferUses buffer_uses,
+ Buffers argument_buffers, Buffers result_buffers,
+ absl::flat_hash_set<int64_t> invariant_arguments,
  std::optional<size_t> scratch_bytes = std::nullopt);

  KernelSpec(absl::string_view name, se::ClusterDim cluster_dim,
  se::BlockDim block_dim, se::ThreadDim thread_dim,
- BufferUses buffer_uses,
+ Buffers argument_buffers, Buffers result_buffers,
+ absl::flat_hash_set<int64_t> invariant_arguments,
  std::optional<size_t> scratch_bytes = std::nullopt);

  // Get the backend specific name of the kernel.
@@ -67,15 +71,28 @@ class KernelSpec {
  // managed buffer that is likely to be in L1/L2 cache).
  std::optional<size_t> scratch_bytes() const { return scratch_bytes_; }

- // Buffers (buffer allocation slices) used by the kernel.
- const BufferUses& buffer_uses() const { return buffer_uses_; }
+ // Argument buffers read by the kernel.
+ const Buffers& argument_buffers() const { return argument_buffers_; }
+ // Result buffers written to by the kernel.
+ const Buffers& result_buffers() const { return result_buffers_; }
+
+ // Returns a set of invariant arguments (corresponding to the indices in the
+ // argument buffers list).
+ const absl::flat_hash_set<int64_t>& invariant_arguments() const {
+ return invariant_arguments_;
+ }

  private:
  std::string name_;
  se::ClusterDim cluster_dim_;
  se::BlockDim block_dim_;
  se::ThreadDim thread_dim_;
- BufferUses buffer_uses_;
+
+ Buffers argument_buffers_;
+ Buffers result_buffers_;
+
+ absl::flat_hash_set<int64_t> invariant_arguments_;
+
  std::optional<size_t> scratch_bytes_;
  };

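A hedged caller-side sketch of how the reworked KernelSpec accessors above might be consumed, assuming an XLA build; DescribeKernelBuffers is an illustrative name and not part of the library:

    #include <cstdint>

    #include "xla/codegen/kernel_spec.h"

    // Arguments whose index appears in invariant_arguments() can be bound as
    // read-only; all result buffers are written by the kernel.
    void DescribeKernelBuffers(const xla::KernelSpec& spec) {
      for (int64_t i = 0;
           i < static_cast<int64_t>(spec.argument_buffers().size()); ++i) {
        const bool read_only = spec.invariant_arguments().contains(i);
        (void)read_only;  // e.g. choose a read-only vs. read-write binding here
      }
      for (const auto& result : spec.result_buffers()) {
        (void)result;  // always treated as writable
      }
    }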

xla/hlo/ir/hlo_casting_utils.h

@@ -44,28 +44,6 @@ T* Cast(HloInstruction* instr) {
  return tsl::down_cast<T*>(instr);
  }

- // Downcasts a const HloInstruction pointer or returns nullptr if argument is
- // nullptr. Dies if TargetClass::ClassOf() does not match.
- template <typename T>
- const T* CastOrNull(const HloInstruction* i) {
- if (i == nullptr) {
- return nullptr;
- }
- CHECK(T::ClassOf(i));
- return tsl::down_cast<const T*>(i);
- }
-
- // Downcasts a const HloInstruction pointer or returns nullptr if argument is
- // nullptr. Dies if TargetClass::ClassOf() does not match.
- template <typename T>
- T* CastOrNull(HloInstruction* i) {
- if (i == nullptr) {
- return nullptr;
- }
- CHECK(T::ClassOf(i));
- return tsl::down_cast<T*>(i);
- }
-
  // Downcasts a const HloInstruction pointer or returns nullptr if
  // TargetClass::ClassOf() does not match. Dies if argument is nullptr. Similar
  // to LLVM's dyn_cast.
@@ -84,28 +62,6 @@ T* DynCast(HloInstruction* i) {
  return !T::ClassOf(i) ? nullptr : tsl::down_cast<T*>(i);
  }

- // Downcasts a const HloInstruction pointer. Return nullptr if argument is
- // nullptr orTargetClass::ClassOf() does not match. Similar to LLVM's
- // dyn_cast_or_null.
- template <typename T>
- const T* DynCastOrNull(const HloInstruction* instruction) {
- if (instruction == nullptr || !T::ClassOf(instruction)) {
- return nullptr;
- }
- return tsl::down_cast<const T*>(instruction);
- }
-
- // Downcasts a non-const HloInstruction pointer. Return nullptr if argument is
- // nullptr orTargetClass::ClassOf() does not match. Similar to LLVM's
- // dyn_cast_or_null.
- template <typename T>
- T* DynCastOrNull(HloInstruction* instruction) {
- if (instruction == nullptr || !T::ClassOf(instruction)) {
- return nullptr;
- }
- return tsl::down_cast<T*>(instruction);
- }
-
  } // namespace xla

  #endif // XLA_HLO_IR_HLO_CASTING_UTILS_H_
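
The nullptr-tolerant CastOrNull/DynCastOrNull helpers are gone from this header. Callers that still need that behavior can, as a hypothetical sketch built on the surviving DynCast<T>, guard for null themselves (DynCastIfNonNull is an illustrative name, not an XLA API):

    #include "xla/hlo/ir/hlo_casting_utils.h"
    #include "xla/hlo/ir/hlo_instruction.h"

    // Returns nullptr when the instruction is null or ClassOf() does not match,
    // deferring to DynCast<T> for the non-null case.
    template <typename T>
    const T* DynCastIfNonNull(const xla::HloInstruction* instr) {
      return instr == nullptr ? nullptr : xla::DynCast<T>(instr);
    }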