optimum-rbln 0.7.3.post2.tar.gz → 0.7.4a0.tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Files changed (189)
  1. optimum_rbln-0.7.4a0/.github/version.yaml +1 -0
  2. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/PKG-INFO +1 -1
  3. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/__version__.py +2 -2
  4. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/ops/__init__.py +1 -1
  5. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/ops/attn.py +9 -7
  6. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/bart/modeling_bart.py +2 -0
  7. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +12 -12
  8. optimum_rbln-0.7.4a0/src/optimum/rbln/transformers/models/t5/modeling_t5.py +210 -0
  9. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/t5/t5_architecture.py +9 -3
  10. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/whisper/modeling_whisper.py +12 -0
  11. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/whisper/whisper_architecture.py +13 -3
  12. optimum_rbln-0.7.3.post2/.github/version.yaml +0 -1
  13. optimum_rbln-0.7.3.post2/src/optimum/rbln/transformers/models/t5/modeling_t5.py +0 -417
  14. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  15. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  16. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  17. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/ISSUE_TEMPLATE/model_request.md +0 -0
  18. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/pull_request_template.md +0 -0
  19. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/scripts/auto_code_review.py +0 -0
  20. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/scripts/validate_pr_checklist.py +0 -0
  21. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/auto_code_review.yml +0 -0
  22. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/check_code_quality.yml +0 -0
  23. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/deploy-on-tag.yaml +0 -0
  24. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/deploy.yaml +0 -0
  25. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/pr-title-check.yaml +0 -0
  26. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/pr_checklist_validator.yml +0 -0
  27. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/rbln_check_compiler.yaml +0 -0
  28. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/rbln_dispatch_pytest.yaml +0 -0
  29. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/rbln_optimum_inference_test.yaml +0 -0
  30. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/rbln_optimum_pytest.yaml +0 -0
  31. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/rbln_scheduled_test.yaml +0 -0
  32. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.github/workflows/rbln_trigger_on_pr.yaml +0 -0
  33. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/.gitignore +0 -0
  34. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/CODE_OF_CONDUCT.md +0 -0
  35. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/CONTRIBUTING.md +0 -0
  36. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/LICENSE +0 -0
  37. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/README.md +0 -0
  38. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/assets/rbln_logo.png +0 -0
  39. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/audio-classification/run_ast_audio_classification.py +0 -0
  40. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/depth-estimation/run_dpt.py +0 -0
  41. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/image-classification/run_image_classification.py +0 -0
  42. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/image-classification/run_vit_image_classification.py +0 -0
  43. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/image-to-text/run_llava_next_image_to_text.py +0 -0
  44. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2.py +0 -0
  45. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2_combined.py +0 -0
  46. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2_img2img.py +0 -0
  47. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2_img2img_combined.py +0 -0
  48. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2_inpaint.py +0 -0
  49. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2_inpaint_combined.py +0 -0
  50. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/kandinsky2_2/run_kandinsky2_2_prior_interpolate.py +0 -0
  51. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/question-answering/run_question_answering.py +0 -0
  52. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/speech-recognition/run_wav2vec2.py +0 -0
  53. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/speech-recognition/run_whisper.py +0 -0
  54. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion.py +0 -0
  55. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion_controlnet.py +0 -0
  56. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion_img2img.py +0 -0
  57. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion_img2img_controlnet.py +0 -0
  58. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion_inpaint.py +0 -0
  59. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion_lora.py +0 -0
  60. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/stable-diffusion/run_stable_diffusion_multicontrolnet.py +0 -0
  61. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text-classification/run_bge_m3_text_classification.py +0 -0
  62. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text-classification/run_bge_reranker_v2_m3_text_classification.py +0 -0
  63. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text-classification/run_secureBERT.py +0 -0
  64. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text-classification/run_t5_classification.py +0 -0
  65. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text-classification/run_twitter_roberta_text_classification.py +0 -0
  66. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text2text-generation/run_bart_text2text_generation.py +0 -0
  67. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text2text-generation/run_llama_peft.py +0 -0
  68. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/examples/text2text-generation/run_llama_text2text_generation.py +0 -0
  69. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/pyproject.toml +0 -0
  70. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/scripts/uv-lock.sh +0 -0
  71. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/scripts/uv-sync.sh +0 -0
  72. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/__init__.py +0 -0
  73. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/__init__.py +0 -0
  74. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/modeling_diffusers.py +0 -0
  75. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/__init__.py +0 -0
  76. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -0
  77. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +0 -0
  78. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/autoencoders/vae.py +0 -0
  79. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/autoencoders/vq_model.py +0 -0
  80. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/controlnet.py +0 -0
  81. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/transformers/__init__.py +0 -0
  82. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/transformers/prior_transformer.py +0 -0
  83. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/transformers/transformer_sd3.py +0 -0
  84. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/unets/__init__.py +0 -0
  85. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/models/unets/unet_2d_condition.py +0 -0
  86. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/__init__.py +0 -0
  87. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/controlnet/__init__.py +0 -0
  88. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -0
  89. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +0 -0
  90. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +0 -0
  91. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +0 -0
  92. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +0 -0
  93. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +0 -0
  94. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +0 -0
  95. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +0 -0
  96. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +0 -0
  97. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +0 -0
  98. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +0 -0
  99. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +0 -0
  100. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +0 -0
  101. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -0
  102. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +0 -0
  103. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +0 -0
  104. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +0 -0
  105. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +0 -0
  106. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +0 -0
  107. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +0 -0
  108. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -0
  109. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -0
  110. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -0
  111. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/modeling.py +0 -0
  112. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/modeling_base.py +0 -0
  113. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/modeling_config.py +0 -0
  114. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/ops/flash_attn.py +0 -0
  115. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/ops/kv_cache_update.py +0 -0
  116. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/__init__.py +0 -0
  117. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/modeling_alias.py +0 -0
  118. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/modeling_generic.py +0 -0
  119. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/modeling_rope_utils.py +0 -0
  120. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/__init__.py +0 -0
  121. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/auto/__init__.py +0 -0
  122. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/auto/auto_factory.py +0 -0
  123. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/auto/modeling_auto.py +0 -0
  124. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/bart/__init__.py +0 -0
  125. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/bart/bart_architecture.py +0 -0
  126. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/bert/__init__.py +0 -0
  127. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/bert/modeling_bert.py +0 -0
  128. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/clip/__init__.py +0 -0
  129. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/clip/modeling_clip.py +0 -0
  130. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/decoderonly/__init__.py +0 -0
  131. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +0 -0
  132. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +0 -0
  133. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/dpt/__init__.py +0 -0
  134. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -0
  135. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/exaone/__init__.py +0 -0
  136. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -0
  137. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/exaone/modeling_exaone.py +0 -0
  138. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/gemma/__init__.py +0 -0
  139. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/gemma/gemma_architecture.py +0 -0
  140. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/gemma/modeling_gemma.py +0 -0
  141. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/gpt2/__init__.py +0 -0
  142. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +0 -0
  143. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +0 -0
  144. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/llama/__init__.py +0 -0
  145. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/llama/llama_architecture.py +0 -0
  146. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/llama/modeling_llama.py +0 -0
  147. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/llava_next/__init__.py +0 -0
  148. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +0 -0
  149. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/midm/__init__.py +0 -0
  150. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/midm/midm_architecture.py +0 -0
  151. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/midm/modeling_midm.py +0 -0
  152. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/mistral/__init__.py +0 -0
  153. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -0
  154. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -0
  155. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/phi/__init__.py +0 -0
  156. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/phi/modeling_phi.py +0 -0
  157. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/phi/phi_architecture.py +0 -0
  158. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/qwen2/__init__.py +0 -0
  159. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -0
  160. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -0
  161. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/seq2seq/__init__.py +0 -0
  162. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +0 -0
  163. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/t5/__init__.py +0 -0
  164. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/wav2vec2/__init__.py +0 -0
  165. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -0
  166. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/whisper/__init__.py +0 -0
  167. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/whisper/generation_whisper.py +0 -0
  168. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/xlm_roberta/__init__.py +0 -0
  169. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -0
  170. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/utils/__init__.py +0 -0
  171. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/transformers/utils/rbln_quantization.py +0 -0
  172. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/__init__.py +0 -0
  173. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/decorator_utils.py +0 -0
  174. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/hub.py +0 -0
  175. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/import_utils.py +0 -0
  176. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/logging.py +0 -0
  177. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/model_utils.py +0 -0
  178. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/runtime_utils.py +0 -0
  179. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/save_utils.py +0 -0
  180. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/src/optimum/rbln/utils/submodule.py +0 -0
  181. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/__init__.py +0 -0
  182. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/psnr.py +0 -0
  183. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/requirements_sdxl.txt +0 -0
  184. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/run_stable_diffusion_xl_base.py +0 -0
  185. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/test_base.py +0 -0
  186. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/test_diffusers.py +0 -0
  187. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/test_llm.py +0 -0
  188. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/tests/test_transformers.py +0 -0
  189. {optimum_rbln-0.7.3.post2 → optimum_rbln-0.7.4a0}/uv.lock +0 -0
optimum_rbln-0.7.4a0/.github/version.yaml (new file)
@@ -0,0 +1 @@
+ rebel_compiler_version: 0.7.4.dev22+gcaaa7596
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: optimum-rbln
- Version: 0.7.3.post2
+ Version: 0.7.4a0
  Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
  Project-URL: Homepage, https://rebellions.ai
  Project-URL: Documentation, https://docs.rbln.ai
src/optimum/rbln/__version__.py
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.7.3.post2'
- __version_tuple__ = version_tuple = (0, 7, 3)
+ __version__ = version = '0.7.4a0'
+ __version_tuple__ = version_tuple = (0, 7, 4)
src/optimum/rbln/ops/__init__.py
@@ -13,7 +13,7 @@
  # limitations under the License.

  from .attn import (
-     register_rbln_custom_add_softmax_attention,
+     register_rbln_custom_paged_add_softmax_attention,
      register_rbln_custom_paged_attention,
      register_rbln_custom_paged_causal_attention,
  )
src/optimum/rbln/ops/attn.py
@@ -182,14 +182,14 @@ def register_rbln_custom_paged_causal_attention():


  @lru_cache
- def register_rbln_custom_add_softmax_attention():
+ def register_rbln_custom_paged_add_softmax_attention():
      torch.library.define(
-         "rbln_custom_ops::add_softmax_attn_decode",
-         "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor",
+         "rbln_custom_ops::paged_add_softmax_attn_decode",
+         "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor",
      )

-     @torch.library.impl("rbln_custom_ops::add_softmax_attn_decode", "cpu")
-     def add_softmax_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
+     @torch.library.impl("rbln_custom_ops::paged_add_softmax_attn_decode", "cpu")
+     def paged_add_softmax_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
          """Defines the computation pattern for fused attention with KV cache updates.

          IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate

@@ -210,12 +210,14 @@ def register_rbln_custom_add_softmax_attention():
          - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
          - seq: [1] - Current sequence position
          - scale: [] - Attention scale factor
+         - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+         - block_size: [] - Number of tokens per block

          Returns:
              Tensor: attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
          """
          return q

-     @register_fake("rbln_custom_ops::add_softmax_attn_decode")
-     def add_softmax_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
+     @register_fake("rbln_custom_ops::paged_add_softmax_attn_decode")
+     def paged_add_softmax_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition, block_table, block_size):
          return q
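For orientation, a minimal sketch of how the renamed op is invoked at a single decode step once registered, mirroring the call sites this release adds in the T5 and Whisper self-attention layers. Shapes follow the docstring above; the mask shape and the concrete dimensions are illustrative assumptions, and on CPU the pattern implementation simply returns q.

import torch
from optimum.rbln.ops import register_rbln_custom_paged_add_softmax_attention

register_rbln_custom_paged_add_softmax_attention()  # wrapped in @lru_cache, safe to call repeatedly

bsz, n_heads, head_dim, max_seq_len = 1, 8, 64, 128
q = torch.randn(bsz, n_heads, 1, 1, head_dim)       # single decode-step query
k = torch.randn(bsz, n_heads, 1, 1, head_dim)
v = torch.randn(bsz, n_heads, 1, 1, head_dim)
mask = torch.zeros(bsz, 1, 1, 1, max_seq_len)       # additive attention mask (shape assumed)
kcache = torch.zeros(bsz, n_heads, 1, max_seq_len, head_dim)
vcache = torch.zeros(bsz, n_heads, 1, max_seq_len, head_dim)
seq = torch.tensor([0], dtype=torch.int32)          # current cache write position
scale = torch.tensor(1.0, dtype=torch.float32)

# New in 0.7.4a0: block_table ("Tensor e") and block_size ("int f") in the schema.
block_table = torch.arange(bsz, dtype=torch.int16).view(bsz, 1)
out = torch.ops.rbln_custom_ops.paged_add_softmax_attn_decode(
    q, k, v, mask, kcache, vcache, seq, scale, block_table, max_seq_len
)
print(out.shape)  # torch.Size([1, 8, 1, 1, 64]) -- the CPU pattern stub returns q as-is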
src/optimum/rbln/transformers/models/bart/modeling_bart.py
@@ -108,6 +108,8 @@ class RBLNBartModel(RBLNModel):


  class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
+     support_causal_attn = True
+
      @classmethod
      def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
          enc_max_seq_len = (
src/optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py
@@ -94,7 +94,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
              decoder_attention_mask if self.use_attention_mask else None,
              attention_mask,
              cache_position,
-             block_tables,
+             block_tables=block_tables,
          )

          return Seq2SeqLMOutput(logits=lm_logits)

@@ -115,6 +115,7 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):

      main_input_name = "input_ids"
      auto_model_class = AutoModelForSeq2SeqLM
+     support_causal_attn = None

      def __post_init__(self, **kwargs):
          batch_size = self.rbln_config.model_cfg["batch_size"]

@@ -186,13 +187,16 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
          rbln_dec_max_seq_len = rbln_kwargs.get("dec_max_seq_len", None)
          rbln_batch_size = rbln_kwargs.get("batch_size", None)
          rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
-         rbln_use_attention_mask = rbln_kwargs.get("use_attention_mask", None)

-         if rbln_use_attention_mask is None:
-             rbln_use_attention_mask = False
-             rbln_npu = rbln_kwargs.get("npu", None) or rebel.get_npu_name()
-             if rbln_npu == "RBLN-CA02":
-                 rbln_use_attention_mask = True
+         if cls.support_causal_attn:
+             rbln_use_attention_mask = rbln_kwargs.get("use_attention_mask", None)
+             if rbln_use_attention_mask is None:
+                 rbln_use_attention_mask = False
+                 rbln_npu = rbln_kwargs.get("npu", None) or rebel.get_npu_name()
+                 if rbln_npu == "RBLN-CA02":
+                     rbln_use_attention_mask = True
+         else:
+             rbln_use_attention_mask = True

          n_layer = getattr(model_config, "decoder_layers", None) or getattr(model_config, "num_layers")
          n_head = getattr(model_config, "decoder_attention_heads", None) or getattr(model_config, "num_heads")

@@ -265,11 +269,7 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
              [rbln_batch_size, 1],
              "int32",
          ),
-         (
-             "block_tables",
-             [rbln_batch_size, 1],
-             "int16",
-         ),
+         ("block_tables", [rbln_batch_size, 1], "int16"),
      ]
      dec_input_info.extend(
          [
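To make the new gating explicit: subclasses now declare whether they support mask-less causal attention, and the base class derives use_attention_mask from that flag (BART sets support_causal_attn = True above; T5 sets it to False in the new modeling_t5.py below). A standalone restatement of the branch, as a hypothetical helper that is not part of the package:

def resolve_use_attention_mask(support_causal_attn, use_attention_mask=None, npu=None):
    # Mirrors the branch added to RBLNModelForSeq2SeqLM._get_rbln_config above.
    if support_causal_attn:
        if use_attention_mask is None:
            # default to mask-less causal attention, except on RBLN-CA02
            return npu == "RBLN-CA02"
        return use_attention_mask
    return True  # e.g. T5: always compile with attention masks

assert resolve_use_attention_mask(True, npu="RBLN-CA02") is True
assert resolve_use_attention_mask(True, npu="RBLN-XXXX") is False  # placeholder NPU name
assert resolve_use_attention_mask(False) is True                   # T5 path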
optimum_rbln-0.7.4a0/src/optimum/rbln/transformers/models/t5/modeling_t5.py (new file)
@@ -0,0 +1,210 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ #     http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+
+ import torch
+ from transformers import (
+     AutoModelForTextEncoding,
+     PretrainedConfig,
+     T5EncoderModel,
+     T5ForConditionalGeneration,
+ )
+ from transformers.modeling_outputs import BaseModelOutput
+
+ from ....diffusers.modeling_diffusers import RBLNDiffusionMixin
+ from ....modeling import RBLNModel
+ from ....modeling_config import RBLNCompileConfig, RBLNConfig
+ from ....utils.logging import get_logger
+ from ....utils.runtime_utils import RBLNPytorchRuntime
+ from ...models.seq2seq import RBLNModelForSeq2SeqLM
+ from .t5_architecture import T5Wrapper
+
+
+ logger = get_logger()
+
+ if TYPE_CHECKING:
+     from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
+
+
+ class RBLNRuntimeModel(RBLNPytorchRuntime):
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         attention_mask: torch.FloatTensor,
+         head_mask: torch.FloatTensor,
+         inputs_embeds: torch.FloatTensor,
+         **kwargs,
+     ):
+         return super().forward(
+             input_ids,
+             attention_mask,
+             head_mask,
+             inputs_embeds,
+             **kwargs,
+         )
+
+
+ class T5EncoderWrapper(torch.nn.Module):
+     def __init__(self, model: "T5EncoderModel") -> None:
+         super().__init__()
+         self.model = model
+
+     def forward(self, *args, **kwargs):
+         kwargs.pop("return_dict", None)
+         return self.model(*args, **kwargs, return_dict=False)
+
+
+ class RBLNT5EncoderModel(RBLNModel):
+     auto_model_class = AutoModelForTextEncoding
+     rbln_model_input_names = ["input_ids", "attention_mask"]
+
+     def __post_init__(self, **kwargs):
+         self.model = RBLNRuntimeModel(runtime=self.model[0])
+
+     @classmethod
+     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+         return T5EncoderWrapper(model)
+
+     @classmethod
+     def update_rbln_config_using_pipe(cls, pipe: RBLNDiffusionMixin, rbln_config: Dict[str, Any]) -> Dict[str, Any]:
+         batch_size = rbln_config.get("batch_size", 1)
+         max_sequence_length = rbln_config.get("max_sequence_length", 256)
+         model_input_names = ["input_ids"]
+
+         rbln_config.update(
+             {
+                 "batch_size": batch_size,
+                 "max_seq_len": max_sequence_length,
+                 "model_input_names": model_input_names,
+             }
+         )
+
+         return rbln_config
+
+     @classmethod
+     def _get_rbln_config(
+         cls,
+         preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+         model_config: Optional["PretrainedConfig"] = None,
+         rbln_kwargs: Dict[str, Any] = {},
+     ) -> RBLNConfig:
+         rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
+         rbln_model_input_names = rbln_kwargs.get("model_input_names", None)
+         rbln_batch_size = rbln_kwargs.get("batch_size", None)
+
+         max_position_embeddings = getattr(model_config, "n_positions", None)
+
+         if rbln_max_seq_len is None:
+             rbln_max_seq_len = max_position_embeddings
+             if rbln_max_seq_len is None:
+                 for tokenizer in preprocessors:
+                     if hasattr(tokenizer, "model_max_length"):
+                         rbln_max_seq_len = tokenizer.model_max_length
+                         break
+                 if rbln_max_seq_len is None:
+                     raise ValueError("`rbln_max_seq_len` should be specified!")
+
+         if max_position_embeddings is not None and rbln_max_seq_len > max_position_embeddings:
+             raise ValueError("`rbln_max_seq_len` should be less or equal than max_position_embeddings!")
+
+         signature_params = inspect.signature(cls.get_hf_class().forward).parameters.keys()
+
+         if rbln_model_input_names is None:
+             for tokenizer in preprocessors:
+                 if hasattr(tokenizer, "model_input_names"):
+                     rbln_model_input_names = [name for name in signature_params if name in tokenizer.model_input_names]
+
+                     invalid_params = set(rbln_model_input_names) - set(signature_params)
+                     if invalid_params:
+                         raise ValueError(f"Invalid model input names: {invalid_params}")
+                     break
+             if rbln_model_input_names is None and hasattr(cls, "rbln_model_input_names"):
+                 rbln_model_input_names = cls.rbln_model_input_names
+             elif rbln_model_input_names is None and hasattr(cls, "rbln_model_input_names") is False:
+                 raise ValueError(
+                     "Specify the model input names obtained by the tokenizer via `rbln_model_input_names`, "
+                     f"and be sure to make the order of the inputs same as T5EncoderModel forward() arguments like ({list(signature_params)})"
+                 )
+         else:
+             invalid_params = set(rbln_model_input_names) - set(signature_params)
+             if invalid_params:
+                 raise ValueError(f"Invalid model input names: {invalid_params}")
+             rbln_model_input_names = [name for name in signature_params if name in rbln_model_input_names]
+
+         if rbln_batch_size is None:
+             rbln_batch_size = 1
+
+         input_info = [
+             (model_input_name, [rbln_batch_size, rbln_max_seq_len], "int64")
+             for model_input_name in rbln_model_input_names
+         ]
+
+         rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+
+         rbln_config = RBLNConfig(
+             rbln_cls=cls.__name__,
+             compile_cfgs=[rbln_compile_config],
+             rbln_kwargs=rbln_kwargs,
+         )
+
+         rbln_config.model_cfg.update({"max_seq_len": rbln_max_seq_len})
+         return rbln_config
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+         encoder_outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             head_mask=head_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         if not return_dict:
+             return (encoder_outputs,)
+         else:
+             return BaseModelOutput(last_hidden_state=encoder_outputs)
+
+
+ class RBLNT5ForConditionalGeneration(RBLNModelForSeq2SeqLM):
+     support_causal_attn = False
+
+     @classmethod
+     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+         enc_max_seq_len = rbln_config.model_cfg["enc_max_seq_len"]
+         dec_max_seq_len = rbln_config.model_cfg["dec_max_seq_len"]
+
+         return T5Wrapper(model, enc_max_seq_len=enc_max_seq_len, dec_max_seq_len=dec_max_seq_len)
+
+     def __getattr__(self, __name: str) -> Any:
+         def redirect(func):
+             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+
+         val = getattr(T5ForConditionalGeneration, __name)
+
+         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+             return redirect(val)
+
+         return val
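A usage sketch for the two classes defined in the new file. The export/rbln_* keyword convention is inferred from the rbln_kwargs lookups above and the package's bundled examples; the checkpoint name, keyword values, and generation settings are illustrative assumptions, not prescriptive.

from transformers import AutoTokenizer
from optimum.rbln import RBLNT5EncoderModel, RBLNT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Encoder-only compile: rbln_max_seq_len feeds the "max_seq_len" read in
# _get_rbln_config above.
encoder = RBLNT5EncoderModel.from_pretrained(
    "t5-small", export=True, rbln_max_seq_len=512, rbln_batch_size=1
)

# Seq2seq generation: support_causal_attn = False, so the decoder is always
# compiled with attention masks (see the modeling_seq2seq.py hunk above).
model = RBLNT5ForConditionalGeneration.from_pretrained(
    "t5-small", export=True, rbln_batch_size=1
)
inputs = tokenizer("translate English to German: Hello, world!", return_tensors="pt")
outputs = model.generate(**inputs, max_length=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))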
src/optimum/rbln/transformers/models/t5/t5_architecture.py
@@ -18,7 +18,7 @@ import torch
  from torch import nn
  from transformers.utils import logging

- from ....ops import register_rbln_custom_add_softmax_attention
+ from ....ops import register_rbln_custom_paged_add_softmax_attention
  from ..seq2seq.seq2seq_architecture import (
      Seq2SeqDecoder,
      Seq2SeqDecoderLayer,

@@ -55,7 +55,7 @@ class T5EncoderWrapper(Seq2SeqEncoderWrapper):

  class T5DecoderWrapper(Seq2SeqDecoderWrapper):
      def __post_init__(self, model, dec_max_seq_len: int = None):
-         register_rbln_custom_add_softmax_attention()
+         register_rbln_custom_paged_add_softmax_attention()
          self.num_layers = self.config.num_layers
          self.conditional_generation = self.convert_to_rbln_conditional_generation(model, dec_max_seq_len)

@@ -77,6 +77,7 @@ class T5DecoderWrapper(Seq2SeqDecoderWrapper):
          attention_mask,
          encoder_attention_mask,
          cache_position,
+         block_tables,
          cross_kv_cache,
          *self_kv_cache,
      ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor]]:

@@ -95,6 +96,7 @@ class T5DecoderWrapper(Seq2SeqDecoderWrapper):
              self_past_key_values=self_past_key_values,
              cross_past_key_values=cross_past_key_values,
              cache_position=cache_position,
+             block_tables=block_tables,
          )

          return lm_logits

@@ -162,7 +164,7 @@ class T5LayerSelfAttention(Seq2SeqSelfAttention):
          self.out_proj = self._original_mod.o
          self.num_heads = self._original_mod.n_heads
          self.head_dim = self._original_mod.key_value_proj_dim
-         self.attn_decode = torch.ops.rbln_custom_ops.add_softmax_attn_decode
+         self.attn_decode = torch.ops.rbln_custom_ops.paged_add_softmax_attn_decode

      def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
          query_states = self.q_proj(hidden_states)

@@ -176,6 +178,7 @@ class T5LayerSelfAttention(Seq2SeqSelfAttention):
          past_key_value: Tuple[torch.Tensor],
          attention_mask: torch.Tensor,
          cache_position: torch.Tensor,
+         block_tables: torch.Tensor,
          **kwargs,
      ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
          bsz, tgt_len, _ = hidden_states.size()

@@ -185,6 +188,7 @@ class T5LayerSelfAttention(Seq2SeqSelfAttention):
          key_states = self._shape(key_states, -1, bsz)
          value_states = self._shape(value_states, -1, bsz)

+         block_size = past_key_value[0].shape[-2]
          attn_output = self.attn_decode(
              query_states,
              key_states,

@@ -196,6 +200,8 @@ class T5LayerSelfAttention(Seq2SeqSelfAttention):
              past_key_value[1].view(bsz, self.num_heads, 1, -1, self.head_dim),
              cache_position,
              torch.tensor(1.0, dtype=torch.float32),  # scale
+             block_tables,
+             block_size,
          )

          attn_output = attn_output.view(bsz, self.num_heads, -1, self.head_dim).transpose(1, 2)
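One detail worth spelling out: with the self-attention cache viewed as [bsz, n_heads, 1, max_seq_len, head_dim], dim -2 is max_seq_len, so the block_size passed here equals the full sequence length and each sequence occupies exactly one cache block. That is consistent with the single-column ("block_tables", [rbln_batch_size, 1], "int16") input declared in modeling_seq2seq.py above. A shape-only illustration with assumed dimensions:

import torch

bsz, n_heads, max_seq_len, head_dim = 2, 8, 128, 64
kcache = torch.zeros(bsz, n_heads, 1, max_seq_len, head_dim)
block_size = kcache.shape[-2]   # 128 == max_seq_len -> one block per sequence
columns = max_seq_len // block_size
print(block_size, columns)      # 128 1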
src/optimum/rbln/transformers/models/whisper/modeling_whisper.py
@@ -61,6 +61,16 @@ class RBLNRuntimeEncoder(RBLNPytorchRuntime):
  class RBLNRuntimeDecoder(RBLNPytorchRuntime):
      mandatory_members = ["main_input_name"]

+     def __init__(
+         self,
+         runtime: rebel.Runtime,
+         batch_size: int,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(runtime, **kwargs)
+         self.batch_size = batch_size
+         self.default_block_tables = torch.arange(0, self.batch_size, dtype=torch.int16).view(self.batch_size, 1)
+
      def forward(
          self,
          decoder_input_ids: torch.Tensor = None,

@@ -76,6 +86,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
              decoder_input_ids=decoder_input_ids,
              decoder_attention_mask=decoder_attention_mask,
              cache_position=cache_position,
+             block_tables=self.default_block_tables,
          )

          if isinstance(outputs, torch.Tensor):

@@ -237,6 +248,7 @@ class RBLNWhisperForConditionalGeneration(RBLNModel, RBLNWhisperGenerationMixin)
              ("decoder_input_ids", [rbln_batch_size, 1], "int64"),
              ("decoder_attention_mask", [rbln_batch_size, rbln_dec_max_seq_len], "int64"),
              ("cache_position", [], "int32"),
+             ("block_tables", [rbln_batch_size, 1], "int16"),
          ]
          dec_input_info.extend(
              [
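The decoder runtime now builds a default identity block table (sequence i uses cache block i), matching the new ("block_tables", [rbln_batch_size, 1], "int16") decoder input above. What it evaluates to, for example at batch_size=4:

import torch

batch_size = 4
default_block_tables = torch.arange(0, batch_size, dtype=torch.int16).view(batch_size, 1)
print(default_block_tables)
# tensor([[0],
#         [1],
#         [2],
#         [3]], dtype=torch.int16)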
src/optimum/rbln/transformers/models/whisper/whisper_architecture.py
@@ -25,7 +25,7 @@ from transformers.modeling_outputs import (
  )
  from transformers.utils import logging

- from ....ops import register_rbln_custom_add_softmax_attention, register_rbln_custom_cache_update
+ from ....ops import register_rbln_custom_cache_update, register_rbln_custom_paged_add_softmax_attention


  logger = logging.get_logger(__name__)

@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
  class WhisperWrapper:
      def __init__(self, model, rbln_token_timestamps):
          register_rbln_custom_cache_update()
-         register_rbln_custom_add_softmax_attention()
+         register_rbln_custom_paged_add_softmax_attention()
          self.encoder = WhisperEncoderWrapper(model)
          self.decoder = WhisperDecoderWrapper(model, output_attentions=rbln_token_timestamps)

@@ -108,6 +108,7 @@ class WhisperDecoderWrapper(torch.nn.Module):
          decoder_input_ids: torch.Tensor,
          decoder_attention_mask: torch.Tensor,
          cache_position: torch.Tensor,
+         block_tables: torch.Tensor,
          cross_kv_cache: torch.Tensor,
          *self_kv_cache: torch.Tensor,
      ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:

@@ -125,6 +126,7 @@ class WhisperDecoderWrapper(torch.nn.Module):
              cache_position=cache_position,
              self_past_key_values=self_past_key_values,
              cross_past_key_values=cross_past_key_values,
+             block_tables=block_tables,
          )

          lm_logits = self.proj_out(sequence_output)

@@ -154,6 +156,7 @@ class WhisperDecoder(nn.Module):
          self_past_key_values: Optional[torch.Tensor] = None,
          cross_past_key_values: Optional[torch.Tensor] = None,
          cache_position: Optional[torch.Tensor] = None,
+         block_tables: Optional[torch.Tensor] = None,
      ):
          input_shape = input_ids.size()
          input_ids = input_ids.view(-1, input_shape[-1])

@@ -177,6 +180,7 @@ class WhisperDecoder(nn.Module):
              self_past_key_value=self_past_key_value,
              cross_past_key_value=cross_past_key_value,
              cache_position=cache_position,
+             block_tables=block_tables,
          )
          cross_attentions += (cross_attn_weights,)

@@ -205,6 +209,7 @@ class WhisperDecoderLayer(nn.Module):
          self_past_key_value: Optional[Tuple[torch.Tensor]] = None,
          cross_past_key_value: Optional[Tuple[torch.Tensor]] = None,
          cache_position: Optional[torch.Tensor] = None,
+         block_tables: Optional[torch.Tensor] = None,
      ) -> torch.Tensor:
          # Self Attention Block
          residual = hidden_states

@@ -214,6 +219,7 @@ class WhisperDecoderLayer(nn.Module):
              past_key_value=self_past_key_value,
              attention_mask=attention_mask,
              cache_position=cache_position,
+             block_tables=block_tables,
          )
          hidden_states = residual + hidden_states

@@ -263,6 +269,7 @@ class WhisperSelfAttention(WhisperAttention):
          past_key_value: Optional[Tuple[torch.Tensor]] = None,
          attention_mask: Optional[torch.Tensor] = None,
          cache_position: Optional[torch.Tensor] = None,
+         block_tables: Optional[torch.Tensor] = None,
      ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
          bsz, tgt_len, _ = hidden_states.size()
          query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)

@@ -270,8 +277,9 @@ class WhisperSelfAttention(WhisperAttention):

          key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
          value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+         block_size = past_key_value[0].shape[-2]

-         attn_output = torch.ops.rbln_custom_ops.add_softmax_attn_decode(
+         attn_output = torch.ops.rbln_custom_ops.paged_add_softmax_attn_decode(
              query_states,
              key_states,
              value_states,

@@ -280,6 +288,8 @@ class WhisperSelfAttention(WhisperAttention):
              past_key_value[1].view(bsz, self.num_heads, 1, -1, self.head_dim),
              cache_position.expand(bsz, 1),
              torch.tensor(1.0, dtype=torch.float32),  # scale
+             block_tables,
+             block_size,
          )

          attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
optimum_rbln-0.7.3.post2/.github/version.yaml (deleted)
@@ -1 +0,0 @@
- rebel_compiler_version: 0.7.3