optimum-rbln 0.7.3a1__tar.gz → 0.7.3a3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum_rbln-0.7.3a3/.github/version.yaml +1 -0
- optimum_rbln-0.7.3a3/.github/workflows/rbln_check_compiler.yaml +61 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/rbln_optimum_pytest.yaml +8 -3
- optimum_rbln-0.7.3a3/.github/workflows/rbln_scheduled_test.yaml +51 -0
- optimum_rbln-0.7.3a3/.github/workflows/rbln_trigger_on_pr.yaml +94 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/PKG-INFO +1 -1
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/__version__.py +2 -2
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/ops/__init__.py +4 -4
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/ops/attn.py +44 -84
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/ops/flash_attn.py +25 -25
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/bart/bart_architecture.py +10 -6
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/bart/modeling_bart.py +3 -1
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +79 -51
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +157 -34
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/exaone/exaone_architecture.py +7 -2
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/gemma/gemma_architecture.py +7 -2
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +3 -1
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/midm/midm_architecture.py +3 -1
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/phi/phi_architecture.py +5 -3
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +44 -13
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +50 -19
- optimum_rbln-0.7.3a3/src/optimum/rbln/transformers/models/t5/modeling_t5.py +417 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/t5/t5_architecture.py +69 -3
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/whisper/whisper_architecture.py +19 -24
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/test_llm.py +18 -22
- optimum_rbln-0.7.3a1/.github/workflows/rbln_trigger_on_pr.yaml +0 -96
- optimum_rbln-0.7.3a1/src/optimum/rbln/transformers/models/t5/modeling_t5.py +0 -208
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/ISSUE_TEMPLATE/model_request.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/pull_request_template.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/scripts/auto_code_review.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/scripts/validate_pr_checklist.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/auto_code_review.yml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/check_code_quality.yml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/deploy-on-tag.yaml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/deploy.yaml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/pr-title-check.yaml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/pr_checklist_validator.yml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/rbln_dispatch_pytest.yaml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.github/workflows/rbln_optimum_inference_test.yaml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/.gitignore +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/CODE_OF_CONDUCT.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/CONTRIBUTING.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/LICENSE +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/README.md +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/assets/rbln_logo.png +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/audio-classification/run_ast_audio_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/depth-estimation/run_dpt.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/image-classification/run_image_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/image-classification/run_vit_image_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/image-to-text/run_llava_next_image_to_text.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/kandinsky2_2/run_kandinsky2_2_inpaint.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/kandinsky2_2/run_kandinsky2_2_inpaint_combined.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/question-answering/run_question_answering.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/speech-recognition/run_wav2vec2.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/speech-recognition/run_whisper.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion_controlnet.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion_img2img.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion_img2img_controlnet.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion_inpaint.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion_lora.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/stable-diffusion/run_stable_diffusion_multicontrolnet.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text-classification/run_bge_m3_text_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text-classification/run_bge_reranker_v2_m3_text_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text-classification/run_secureBERT.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text-classification/run_t5_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text-classification/run_twitter_roberta_text_classification.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text2text-generation/run_bart_text2text_generation.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text2text-generation/run_llama_peft.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/examples/text2text-generation/run_llama_text2text_generation.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/pyproject.toml +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/scripts/uv-lock.sh +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/scripts/uv-sync.sh +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/modeling_diffusers.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/autoencoders/vae.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/autoencoders/vq_model.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/controlnet.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/transformers/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/transformers/prior_transformer.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/transformers/transformer_sd3.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/unets/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/models/unets/unet_2d_condition.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/controlnet/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/modeling.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/modeling_base.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/modeling_config.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/ops/kv_cache_update.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/modeling_alias.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/modeling_generic.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/modeling_rope_utils.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/auto/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/auto/auto_factory.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/auto/modeling_auto.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/bart/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/bert/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/bert/modeling_bert.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/clip/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/clip/modeling_clip.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/decoderonly/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/dpt/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/exaone/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/exaone/modeling_exaone.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/gemma/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/gemma/modeling_gemma.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/gpt2/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/llama/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/llama/llama_architecture.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/llama/modeling_llama.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/llava_next/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/midm/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/midm/modeling_midm.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/mistral/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/phi/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/phi/modeling_phi.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/qwen2/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/seq2seq/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/t5/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/wav2vec2/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/whisper/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/whisper/generation_whisper.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/whisper/modeling_whisper.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/xlm_roberta/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/utils/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/transformers/utils/rbln_quantization.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/decorator_utils.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/hub.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/import_utils.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/logging.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/model_utils.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/runtime_utils.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/save_utils.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/src/optimum/rbln/utils/submodule.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/__init__.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/psnr.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/requirements_sdxl.txt +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/run_stable_diffusion_xl_base.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/test_base.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/test_diffusers.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/tests/test_transformers.py +0 -0
- {optimum_rbln-0.7.3a1 → optimum_rbln-0.7.3a3}/uv.lock +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
rebel_compiler_version: 0.7.3.dev150+g305d8758
|
@@ -0,0 +1,61 @@
|
|
1
|
+
name: Check Rebel Compiler Version
|
2
|
+
|
3
|
+
on:
|
4
|
+
workflow_call:
|
5
|
+
inputs:
|
6
|
+
compiler_version:
|
7
|
+
description: "Rebel compiler version to check"
|
8
|
+
required: true
|
9
|
+
type: string
|
10
|
+
outputs:
|
11
|
+
is_deploy_version:
|
12
|
+
description: "Whether the compiler is a deploy version"
|
13
|
+
value: ${{ jobs.check-rebel-compiler-version.outputs.is_deploy_version }}
|
14
|
+
compiler_version:
|
15
|
+
description: "The compiler version used"
|
16
|
+
value: ${{ jobs.check-rebel-compiler-version.outputs.compiler_version }}
|
17
|
+
|
18
|
+
jobs:
|
19
|
+
check-rebel-compiler-version:
|
20
|
+
runs-on: rebel-k8s-runner
|
21
|
+
outputs:
|
22
|
+
is_deploy_version: ${{ steps.check_prod.outputs.IS_PROD }}
|
23
|
+
compiler_version: ${{ steps.install_compiler.outputs.COMPILER_VERSION }}
|
24
|
+
steps:
|
25
|
+
- name: Set up Python
|
26
|
+
uses: actions/setup-python@v5
|
27
|
+
with:
|
28
|
+
python-version: "3.9"
|
29
|
+
|
30
|
+
- name: Install rebel-compiler
|
31
|
+
id: install_compiler
|
32
|
+
env:
|
33
|
+
REBEL_PYPI_ENDPOINT: ${{ vars.REBEL_PYPI_INTERNAL_ENDPOINT }}
|
34
|
+
REBEL_PYPI_USERNAME: ${{ secrets.REBEL_PYPI_USERNAME }}
|
35
|
+
REBEL_PYPI_PASSWORD: ${{ secrets.REBEL_PYPI_PASSWORD }}
|
36
|
+
run: |
|
37
|
+
set -e
|
38
|
+
PYPI_URL=$(echo $REBEL_PYPI_ENDPOINT | sed "s/\/\//\0$REBEL_PYPI_USERNAME:$REBEL_PYPI_PASSWORD@/")
|
39
|
+
pip3 install --extra-index-url $PYPI_URL rebel-compiler==${{ inputs.compiler_version }}
|
40
|
+
echo "COMPILER_VERSION=${{ inputs.compiler_version }}" >> $GITHUB_OUTPUT
|
41
|
+
|
42
|
+
- name: Run script to check ENV
|
43
|
+
id: check_env
|
44
|
+
run: |
|
45
|
+
echo "Running check for rebel-compiler version"
|
46
|
+
ENV_VALUE=$(python3 -c '${{ secrets.CHECK_DEPLOY }}')
|
47
|
+
echo $ENV_VALUE
|
48
|
+
echo "ENV_VALUE=$ENV_VALUE" >> $GITHUB_ENV
|
49
|
+
|
50
|
+
- name: Verify ENV is PROD
|
51
|
+
id: check_prod
|
52
|
+
run: |
|
53
|
+
if [ "$ENV_VALUE" = "PROD" ]; then
|
54
|
+
echo "IS_PROD=true" >> $GITHUB_OUTPUT
|
55
|
+
echo "version check pass(✅)!!"
|
56
|
+
else
|
57
|
+
echo "IS_PROD=false" >> $GITHUB_OUTPUT
|
58
|
+
echo "version check fail(❌)!!"
|
59
|
+
echo "rebel-compiler must be prod version"
|
60
|
+
exit 1
|
61
|
+
fi
|
@@ -15,6 +15,11 @@ on:
|
|
15
15
|
description: "rebel_compiler version to run"
|
16
16
|
required: true
|
17
17
|
type: string
|
18
|
+
test_level:
|
19
|
+
description: "Test level for OPTIMUM_RBLN_TEST_LEVEL (default, full, essential)"
|
20
|
+
required: false
|
21
|
+
type: string
|
22
|
+
default: "default"
|
18
23
|
|
19
24
|
env:
|
20
25
|
REBEL_PYPI_ENDPOINT: ${{ vars.REBEL_PYPI_INTERNAL_ENDPOINT }}
|
@@ -53,18 +58,18 @@ jobs:
|
|
53
58
|
|
54
59
|
- name: Run pytest (transformers)
|
55
60
|
env:
|
56
|
-
OPTIMUM_RBLN_TEST_LEVEL:
|
61
|
+
OPTIMUM_RBLN_TEST_LEVEL: ${{ inputs.test_level }}
|
57
62
|
run: |
|
58
63
|
uv run --no-sync pytest tests/test_transformers.py
|
59
64
|
|
60
65
|
- name: Run pytest (diffusers)
|
61
66
|
env:
|
62
|
-
OPTIMUM_RBLN_TEST_LEVEL:
|
67
|
+
OPTIMUM_RBLN_TEST_LEVEL: ${{ inputs.test_level }}
|
63
68
|
run: |
|
64
69
|
uv run --no-sync pytest tests/test_diffusers.py
|
65
70
|
|
66
71
|
- name: Run pytest (llm)
|
67
72
|
env:
|
68
|
-
OPTIMUM_RBLN_TEST_LEVEL:
|
73
|
+
OPTIMUM_RBLN_TEST_LEVEL: ${{ inputs.test_level }}
|
69
74
|
run: |
|
70
75
|
uv run --no-sync pytest tests/test_llm.py
|
@@ -0,0 +1,51 @@
|
|
1
|
+
name: Optimum-rbln / Scheduled Test
|
2
|
+
|
3
|
+
on:
|
4
|
+
schedule:
|
5
|
+
# Run every day at 2am (17:00 UTC, 2:00am KST)
|
6
|
+
- cron: '0 17 * * *'
|
7
|
+
|
8
|
+
env:
|
9
|
+
HF_USER_ID: ${{ secrets.HF_USER_ID }}
|
10
|
+
HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }}
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
load-version:
|
14
|
+
runs-on: rebel-k8s-runner
|
15
|
+
outputs:
|
16
|
+
compiler_version: ${{ steps.get_version.outputs.compiler_version }}
|
17
|
+
steps:
|
18
|
+
- name: Checkout code
|
19
|
+
uses: actions/checkout@v3
|
20
|
+
|
21
|
+
- name: Get compiler version
|
22
|
+
id: get_version
|
23
|
+
run: |
|
24
|
+
VERSION=$(grep rebel_compiler_version .github/version.yaml | cut -d ':' -f2 | tr -d ' ')
|
25
|
+
echo "compiler_version=$VERSION" >> $GITHUB_OUTPUT
|
26
|
+
|
27
|
+
check-compiler:
|
28
|
+
needs: load-version
|
29
|
+
uses: ./.github/workflows/rbln_check_compiler.yaml
|
30
|
+
with:
|
31
|
+
compiler_version: ${{ needs.load-version.outputs.compiler_version }}
|
32
|
+
secrets: inherit
|
33
|
+
|
34
|
+
optimum-rbln-pytest:
|
35
|
+
needs: [load-version, check-compiler]
|
36
|
+
if: ${{ needs.check-compiler.outputs.is_deploy_version == 'true' }}
|
37
|
+
uses: ./.github/workflows/rbln_optimum_pytest.yaml
|
38
|
+
with:
|
39
|
+
ref: main
|
40
|
+
rebel_compiler_version: ${{ needs.check-compiler.outputs.compiler_version }}
|
41
|
+
test_level: "full"
|
42
|
+
secrets: inherit
|
43
|
+
|
44
|
+
optimum-rbln-inference-test:
|
45
|
+
needs: check-compiler
|
46
|
+
if: ${{ needs.check-compiler.outputs.is_deploy_version == 'true' }}
|
47
|
+
uses: ./.github/workflows/rbln_optimum_inference_test.yaml
|
48
|
+
with:
|
49
|
+
ref: main
|
50
|
+
rebel_compiler_version: ${{ needs.check-compiler.outputs.compiler_version }}
|
51
|
+
secrets: inherit
|
@@ -0,0 +1,94 @@
|
|
1
|
+
name: Optimum-rbln / PR
|
2
|
+
|
3
|
+
on:
|
4
|
+
pull_request:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
|
8
|
+
env:
|
9
|
+
REBEL_PYPI_ENDPOINT: ${{ vars.REBEL_PYPI_INTERNAL_ENDPOINT }}
|
10
|
+
REBEL_PYPI_USERNAME: ${{ secrets.REBEL_PYPI_USERNAME }}
|
11
|
+
REBEL_PYPI_PASSWORD: ${{ secrets.REBEL_PYPI_PASSWORD }}
|
12
|
+
|
13
|
+
jobs:
|
14
|
+
check-skip-ci:
|
15
|
+
runs-on: rebel-k8s-runner
|
16
|
+
outputs:
|
17
|
+
should_skip: ${{ contains(github.event.pull_request.head.commit.message, '[skip ci]') }}
|
18
|
+
steps:
|
19
|
+
- name: Check if [skip ci] is in commit message
|
20
|
+
run: |
|
21
|
+
if ${{ contains(github.event.pull_request.head.commit.message, '[skip ci]') }}; then
|
22
|
+
echo "Found [skip ci] in commit message, skipping CI"
|
23
|
+
else
|
24
|
+
echo "No [skip ci] found, continuing with CI"
|
25
|
+
fi
|
26
|
+
|
27
|
+
load-version:
|
28
|
+
runs-on: rebel-k8s-runner
|
29
|
+
needs: check-skip-ci
|
30
|
+
if: ${{ needs.check-skip-ci.outputs.should_skip != 'true' }}
|
31
|
+
outputs:
|
32
|
+
compiler_version: ${{ steps.get_version.outputs.compiler_version }}
|
33
|
+
steps:
|
34
|
+
- name: Checkout code
|
35
|
+
uses: actions/checkout@v3
|
36
|
+
|
37
|
+
- name: Get compiler version
|
38
|
+
id: get_version
|
39
|
+
run: |
|
40
|
+
VERSION=$(grep rebel_compiler_version .github/version.yaml | cut -d ':' -f2 | tr -d ' ')
|
41
|
+
echo "compiler_version=$VERSION" >> $GITHUB_OUTPUT
|
42
|
+
|
43
|
+
check-compiler:
|
44
|
+
needs: [check-skip-ci, load-version]
|
45
|
+
if: ${{ needs.check-skip-ci.outputs.should_skip != 'true' }}
|
46
|
+
uses: ./.github/workflows/rbln_check_compiler.yaml
|
47
|
+
with:
|
48
|
+
compiler_version: ${{ needs.load-version.outputs.compiler_version }}
|
49
|
+
secrets: inherit
|
50
|
+
|
51
|
+
check-team-member:
|
52
|
+
runs-on: rebel-k8s-runner
|
53
|
+
needs: [check-skip-ci, check-compiler]
|
54
|
+
if: ${{ needs.check-skip-ci.outputs.should_skip != 'true' && needs.check-compiler.outputs.is_deploy_version == 'true' }}
|
55
|
+
outputs:
|
56
|
+
is_team_member: ${{ steps.check_member.outputs.IS_TEAM_MEMBER }}
|
57
|
+
steps:
|
58
|
+
- name: Fetch team members
|
59
|
+
id: fetch_team
|
60
|
+
run: |
|
61
|
+
response=$(curl -s -H "Authorization: Bearer ${{ secrets.GIT_PAT }}" \
|
62
|
+
-H "Content-Type: application/json" \
|
63
|
+
-d '{"query":"query { organization(login: \"rebellions-sw\") { team(slug: \"rebel-sw-team\") { members(first: 100) { nodes { login } } } } }"}' \
|
64
|
+
https://api.github.com/graphql)
|
65
|
+
echo "$response" | jq -r '.data.organization.team.members.nodes[].login' > team_members.txt
|
66
|
+
|
67
|
+
- name: Check if PR author is a team member
|
68
|
+
id: check_member
|
69
|
+
run: |
|
70
|
+
pr_author=${{ github.event.pull_request.user.login }}
|
71
|
+
if grep -qx "$pr_author" team_members.txt; then
|
72
|
+
echo "IS_TEAM_MEMBER=true" >> $GITHUB_OUTPUT
|
73
|
+
else
|
74
|
+
echo "IS_TEAM_MEMBER=false" >> $GITHUB_OUTPUT
|
75
|
+
fi
|
76
|
+
|
77
|
+
optimum-rbln-pytest:
|
78
|
+
needs: [check-skip-ci, check-compiler, check-team-member]
|
79
|
+
if: ${{ needs.check-skip-ci.outputs.should_skip != 'true' && needs.check-team-member.outputs.is_team_member == 'true' }}
|
80
|
+
uses: ./.github/workflows/rbln_optimum_pytest.yaml
|
81
|
+
with:
|
82
|
+
ref: ${{ github.event.pull_request.head.sha }}
|
83
|
+
rebel_compiler_version: ${{ needs.check-compiler.outputs.compiler_version }}
|
84
|
+
test_level: "default"
|
85
|
+
secrets: inherit
|
86
|
+
|
87
|
+
optimum-rbln-inference-test:
|
88
|
+
needs: [check-skip-ci, check-compiler, check-team-member]
|
89
|
+
if: ${{ needs.check-skip-ci.outputs.should_skip != 'true' && needs.check-team-member.outputs.is_team_member == 'true' }}
|
90
|
+
uses: ./.github/workflows/rbln_optimum_inference_test.yaml
|
91
|
+
with:
|
92
|
+
ref: ${{ github.event.pull_request.head.sha }}
|
93
|
+
rebel_compiler_version: ${{ needs.check-compiler.outputs.compiler_version }}
|
94
|
+
secrets: inherit
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: optimum-rbln
|
3
|
-
Version: 0.7.3a1
|
3
|
+
Version: 0.7.3a3
|
4
4
|
Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
|
5
5
|
Project-URL: Homepage, https://rebellions.ai
|
6
6
|
Project-URL: Documentation, https://docs.rbln.ai
|
@@ -17,5 +17,5 @@ __version__: str
|
|
17
17
|
__version_tuple__: VERSION_TUPLE
|
18
18
|
version_tuple: VERSION_TUPLE
|
19
19
|
|
20
|
-
__version__ = version = '0.7.3a1'
|
21
|
-
__version_tuple__ = version_tuple = (0, 7, 3)
|
20
|
+
__version__ = version = '0.7.3a3'
|
21
|
+
__version_tuple__ = version_tuple = (0, 7, 3, 'a3')
|
@@ -13,9 +13,9 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
from .attn import (
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
register_rbln_custom_add_softmax_attention,
|
17
|
+
register_rbln_custom_paged_attention,
|
18
|
+
register_rbln_custom_paged_causal_attention,
|
19
19
|
)
|
20
|
-
from .flash_attn import
|
20
|
+
from .flash_attn import register_rbln_custom_paged_flash_attention, register_rbln_custom_paged_flash_causal_attention
|
21
21
|
from .kv_cache_update import register_rbln_custom_cache_update
|
@@ -25,14 +25,14 @@ else:
|
|
25
25
|
|
26
26
|
|
27
27
|
@lru_cache
|
28
|
-
def register_rbln_custom_masked_attention():
|
28
|
+
def register_rbln_custom_paged_attention():
|
29
29
|
torch.library.define(
|
30
|
-
"rbln_custom_ops::
|
31
|
-
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
|
30
|
+
"rbln_custom_ops::paged_attn_decode",
|
31
|
+
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
|
32
32
|
)
|
33
33
|
|
34
|
-
@torch.library.impl("rbln_custom_ops::
|
35
|
-
def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
|
34
|
+
@torch.library.impl("rbln_custom_ops::paged_attn_decode", "cpu")
|
35
|
+
def attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
|
36
36
|
"""Defines the computation pattern for fused attention with KV cache updates.
|
37
37
|
|
38
38
|
IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
|
@@ -51,8 +51,10 @@ def register_rbln_custom_masked_attention():
|
|
51
51
|
- mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
|
52
52
|
- kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
|
53
53
|
- vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
|
54
|
-
- seq: [1] - Current sequence position
|
54
|
+
- seq: [1, 1] - Current sequence position
|
55
55
|
- scale: [] - Attention scale factor
|
56
|
+
- block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
|
57
|
+
- block_size: [] - Number of tokens per block
|
56
58
|
|
57
59
|
Returns:
|
58
60
|
Tuple[Tensor, Tensor, Tensor]:
|
@@ -66,8 +68,8 @@ def register_rbln_custom_masked_attention():
|
|
66
68
|
torch.empty(*vcache.shape, device=vcache.device),
|
67
69
|
)
|
68
70
|
|
69
|
-
@register_fake("rbln_custom_ops::
|
70
|
-
def attn_decode_abstract(q, k, v, m, kcache, vcache, seq,
|
71
|
+
@register_fake("rbln_custom_ops::paged_attn_decode")
|
72
|
+
def attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
|
71
73
|
return (
|
72
74
|
q,
|
73
75
|
torch.empty(*kcache.shape, device=kcache.device),
|
@@ -75,12 +77,12 @@ def register_rbln_custom_masked_attention():
|
|
75
77
|
)
|
76
78
|
|
77
79
|
torch.library.define(
|
78
|
-
"rbln_custom_ops::
|
79
|
-
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
|
80
|
+
"rbln_custom_ops::paged_attn_prefill",
|
81
|
+
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
|
80
82
|
)
|
81
83
|
|
82
|
-
@torch.library.impl("rbln_custom_ops::
|
83
|
-
def attn_prefill_cpu(q, k, v, mask, kcache, vcache,
|
84
|
+
@torch.library.impl("rbln_custom_ops::paged_attn_prefill", "cpu")
|
85
|
+
def attn_prefill_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size):
|
84
86
|
"""Defines the computation pattern for prefill phase attention with KV cache updates.
|
85
87
|
|
86
88
|
IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
|
@@ -97,9 +99,10 @@ def register_rbln_custom_masked_attention():
|
|
97
99
|
- mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
|
98
100
|
- kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
|
99
101
|
- vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
|
100
|
-
-
|
101
|
-
- seq: [1] - Starting sequence position
|
102
|
+
- seq: [1, 1] - Starting sequence position
|
102
103
|
- scale: [] - Attention scale factor
|
104
|
+
- block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
|
105
|
+
- block_size: [] - Number of tokens per block
|
103
106
|
|
104
107
|
Returns:
|
105
108
|
Tuple[Tensor, Tensor, Tensor]:
|
@@ -109,20 +112,20 @@ def register_rbln_custom_masked_attention():
|
|
109
112
|
"""
|
110
113
|
return q, kcache, vcache
|
111
114
|
|
112
|
-
@register_fake("rbln_custom_ops::
|
113
|
-
def attn_prefill_abstract(q, k, v, m, kcache, vcache,
|
115
|
+
@register_fake("rbln_custom_ops::paged_attn_prefill")
|
116
|
+
def attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size):
|
114
117
|
return q, kcache, vcache
|
115
118
|
|
116
119
|
|
117
120
|
@lru_cache
|
118
|
-
def
|
121
|
+
def register_rbln_custom_paged_causal_attention():
|
119
122
|
torch.library.define(
|
120
|
-
"rbln_custom_ops::
|
121
|
-
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
|
123
|
+
"rbln_custom_ops::paged_causal_attn_decode",
|
124
|
+
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
|
122
125
|
)
|
123
126
|
|
124
|
-
@torch.library.impl("rbln_custom_ops::
|
125
|
-
def attn_decode_cpu(q, k, v, kcache, vcache, seq, scale):
|
127
|
+
@torch.library.impl("rbln_custom_ops::paged_causal_attn_decode", "cpu")
|
128
|
+
def attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
|
126
129
|
"""Defines the computation pattern for fused attention with KV cache updates.
|
127
130
|
|
128
131
|
IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
|
@@ -140,8 +143,10 @@ def register_rbln_custom_causal_masked_attention():
|
|
140
143
|
- v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
|
141
144
|
- kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
|
142
145
|
- vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
|
143
|
-
- seq: [1] -
|
146
|
+
- seq: [1, 1] - Starting sequence position
|
144
147
|
- scale: [] - Attention scale factor
|
148
|
+
- block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
|
149
|
+
- block_size: [] - Number of tokens per block
|
145
150
|
|
146
151
|
Returns:
|
147
152
|
Tuple[Tensor, Tensor, Tensor]:
|
@@ -155,8 +160,8 @@ def register_rbln_custom_causal_masked_attention():
|
|
155
160
|
torch.empty(*vcache.shape, device=vcache.device),
|
156
161
|
)
|
157
162
|
|
158
|
-
@register_fake("rbln_custom_ops::
|
159
|
-
def attn_decode_abstract(q, k, v, kcache, vcache, seq,
|
163
|
+
@register_fake("rbln_custom_ops::paged_causal_attn_decode")
|
164
|
+
def attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
|
160
165
|
return (
|
161
166
|
q,
|
162
167
|
torch.empty(*kcache.shape, device=kcache.device),
|
@@ -164,12 +169,12 @@ def register_rbln_custom_causal_masked_attention():
|
|
164
169
|
)
|
165
170
|
|
166
171
|
torch.library.define(
|
167
|
-
"rbln_custom_ops::
|
168
|
-
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
|
172
|
+
"rbln_custom_ops::paged_causal_attn_prefill",
|
173
|
+
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
|
169
174
|
)
|
170
175
|
|
171
|
-
@torch.library.impl("rbln_custom_ops::
|
172
|
-
def attn_prefill_cpu(q, k, v, kcache, vcache,
|
176
|
+
@torch.library.impl("rbln_custom_ops::paged_causal_attn_prefill", "cpu")
|
177
|
+
def attn_prefill_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
|
173
178
|
"""Defines the computation pattern for prefill phase attention with KV cache updates.
|
174
179
|
|
175
180
|
IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
|
@@ -186,8 +191,10 @@ def register_rbln_custom_causal_masked_attention():
|
|
186
191
|
- kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
|
187
192
|
- vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
|
188
193
|
- batch: [1] - Batch index for cache access
|
189
|
-
- seq: [1] - Starting sequence position
|
194
|
+
- seq: [1, 1] - Starting sequence position
|
190
195
|
- scale: [] - Attention scale factor
|
196
|
+
- block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
|
197
|
+
- block_size: [] - Number of tokens per block
|
191
198
|
|
192
199
|
Returns:
|
193
200
|
Tuple[Tensor, Tensor, Tensor]:
|
@@ -197,20 +204,20 @@ def register_rbln_custom_causal_masked_attention():
|
|
197
204
|
"""
|
198
205
|
return q, kcache, vcache
|
199
206
|
|
200
|
-
@register_fake("rbln_custom_ops::
|
201
|
-
def attn_prefill_abstract(q, k, v, kcache, vcache,
|
207
|
+
@register_fake("rbln_custom_ops::paged_causal_attn_prefill")
|
208
|
+
def attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size):
|
202
209
|
return q, kcache, vcache
|
203
210
|
|
204
211
|
|
205
212
|
@lru_cache
|
206
|
-
def
|
213
|
+
def register_rbln_custom_add_softmax_attention():
|
207
214
|
torch.library.define(
|
208
|
-
"rbln_custom_ops::
|
215
|
+
"rbln_custom_ops::add_softmax_attn_decode",
|
209
216
|
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
|
210
217
|
)
|
211
218
|
|
212
|
-
@torch.library.impl("rbln_custom_ops::
|
213
|
-
def
|
219
|
+
@torch.library.impl("rbln_custom_ops::add_softmax_attn_decode", "cpu")
|
220
|
+
def add_softmax_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale):
|
214
221
|
"""Defines the computation pattern for fused attention with KV cache updates.
|
215
222
|
|
216
223
|
IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
|
@@ -244,57 +251,10 @@ def register_rbln_custom_attention_add_softmax():
|
|
244
251
|
torch.empty(*vcache.shape, device=vcache.device),
|
245
252
|
)
|
246
253
|
|
247
|
-
@register_fake("rbln_custom_ops::
|
248
|
-
def
|
254
|
+
@register_fake("rbln_custom_ops::add_softmax_attn_decode")
|
255
|
+
def add_softmax_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
|
249
256
|
return (
|
250
257
|
q,
|
251
258
|
torch.empty(*kcache.shape, device=kcache.device),
|
252
259
|
torch.empty(*vcache.shape, device=vcache.device),
|
253
260
|
)
|
254
|
-
|
255
|
-
torch.library.define(
|
256
|
-
"rbln_custom_ops::attn_prefill_add_softmax",
|
257
|
-
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
|
258
|
-
)
|
259
|
-
|
260
|
-
@torch.library.impl("rbln_custom_ops::attn_prefill_add_softmax", "cpu")
|
261
|
-
def attn_prefill_add_softmax_cpu(q, k, v, mask, kcache, vcache, batch, seq, scale):
|
262
|
-
"""Defines the computation pattern for prefill phase attention with KV cache updates.
|
263
|
-
|
264
|
-
IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
|
265
|
-
a single optimized NPU operation. It is NOT meant for CPU execution.
|
266
|
-
|
267
|
-
Key differences from decode pattern:
|
268
|
-
- Handles prefill phase with multiple input tokens
|
269
|
-
- Takes explicit batch index for continuous batching
|
270
|
-
|
271
|
-
Expected tensor shapes:
|
272
|
-
- q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
|
273
|
-
- k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
|
274
|
-
- v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
|
275
|
-
- mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
|
276
|
-
- kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
|
277
|
-
- vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
|
278
|
-
- batch: [1] - Batch index for cache access
|
279
|
-
- seq: [1] - Starting sequence position
|
280
|
-
- scale: [] - Attention scale factor
|
281
|
-
|
282
|
-
Returns:
|
283
|
-
Tuple[Tensor, Tensor, Tensor]:
|
284
|
-
- attn_output: [batch=1, n_heads, seq_len, 1, head_dim] - Attention output
|
285
|
-
- empty_kcache: Same shape as input kcache - Placeholder for compiler
|
286
|
-
- empty_vcache: Same shape as input vcache - Placeholder for compiler
|
287
|
-
"""
|
288
|
-
return (
|
289
|
-
q,
|
290
|
-
torch.empty(1, *kcache.shape[1:], device=kcache.device),
|
291
|
-
torch.empty(1, *vcache.shape[1:], device=vcache.device),
|
292
|
-
)
|
293
|
-
|
294
|
-
@register_fake("rbln_custom_ops::attn_prefill_add_softmax")
|
295
|
-
def attn_prefill_add_softmax_abstract(q, k, v, m, kcache, vcache, batch, seq, partition):
|
296
|
-
return (
|
297
|
-
q,
|
298
|
-
torch.empty(1, *kcache.shape[1:], device=kcache.device),
|
299
|
-
torch.empty(1, *vcache.shape[1:], device=vcache.device),
|
300
|
-
)
|
@@ -25,22 +25,22 @@ else:
|
|
25
25
|
|
26
26
|
|
27
27
|
@lru_cache
|
28
|
-
def
|
28
|
+
def register_rbln_custom_paged_flash_attention():
|
29
29
|
torch.library.define(
|
30
|
-
"rbln_custom_ops::
|
31
|
-
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, int
|
30
|
+
"rbln_custom_ops::paged_flash_attn_decode",
|
31
|
+
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
|
32
32
|
)
|
33
33
|
|
34
|
-
@torch.library.impl("rbln_custom_ops::
|
35
|
-
def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, partition):
|
34
|
+
@torch.library.impl("rbln_custom_ops::paged_flash_attn_decode", "cpu")
|
35
|
+
def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size, partition):
|
36
36
|
return (
|
37
37
|
q,
|
38
38
|
torch.empty(*kcache.shape, device=kcache.device),
|
39
39
|
torch.empty(*vcache.shape, device=vcache.device),
|
40
40
|
)
|
41
41
|
|
42
|
-
@register_fake("rbln_custom_ops::
|
43
|
-
def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, partition):
|
42
|
+
@register_fake("rbln_custom_ops::paged_flash_attn_decode")
|
43
|
+
def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size, partition):
|
44
44
|
return (
|
45
45
|
q,
|
46
46
|
torch.empty(*kcache.shape, device=kcache.device),
|
@@ -48,36 +48,36 @@ def register_rbln_custom_flash_masked_attention():
|
|
48
48
|
)
|
49
49
|
|
50
50
|
torch.library.define(
|
51
|
-
"rbln_custom_ops::
|
52
|
-
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
|
51
|
+
"rbln_custom_ops::paged_flash_attn_prefill",
|
52
|
+
"(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
|
53
53
|
)
|
54
54
|
|
55
55
|
@torch.library.impl("rbln_custom_ops::flash_attn_prefill", "cpu")
|
56
|
-
def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache,
|
56
|
+
def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size, partition):
|
57
57
|
return q, kcache, vcache
|
58
58
|
|
59
|
-
@register_fake("rbln_custom_ops::
|
60
|
-
def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache,
|
59
|
+
@register_fake("rbln_custom_ops::paged_flash_attn_prefill")
|
60
|
+
def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache, seq, scale, block_table, block_size, partition):
|
61
61
|
return q, kcache, vcache
|
62
62
|
|
63
63
|
|
64
64
|
@lru_cache
|
65
|
-
def
|
65
|
+
def register_rbln_custom_paged_flash_causal_attention():
|
66
66
|
torch.library.define(
|
67
|
-
"rbln_custom_ops::
|
68
|
-
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, int
|
67
|
+
"rbln_custom_ops::paged_flash_causal_attn_decode",
|
68
|
+
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
|
69
69
|
)
|
70
70
|
|
71
|
-
@torch.library.impl("rbln_custom_ops::
|
72
|
-
def flash_attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, partition):
|
71
|
+
@torch.library.impl("rbln_custom_ops::paged_flash_causal_attn_decode", "cpu")
|
72
|
+
def flash_attn_decode_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
|
73
73
|
return (
|
74
74
|
q,
|
75
75
|
torch.empty(*kcache.shape, device=kcache.device),
|
76
76
|
torch.empty(*vcache.shape, device=vcache.device),
|
77
77
|
)
|
78
78
|
|
79
|
-
@register_fake("rbln_custom_ops::
|
80
|
-
def flash_attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, partition):
|
79
|
+
@register_fake("rbln_custom_ops::paged_flash_causal_attn_decode")
|
80
|
+
def flash_attn_decode_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
|
81
81
|
return (
|
82
82
|
q,
|
83
83
|
torch.empty(*kcache.shape, device=kcache.device),
|
@@ -85,14 +85,14 @@ def register_rbln_custom_flash_causal_masked_attention():
|
|
85
85
|
)
|
86
86
|
|
87
87
|
torch.library.define(
|
88
|
-
"rbln_custom_ops::
|
89
|
-
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f) -> Tensor[]",
|
88
|
+
"rbln_custom_ops::paged_flash_causal_attn_prefill",
|
89
|
+
"(Tensor x, Tensor y, Tensor z, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e, int f, int g) -> Tensor[]",
|
90
90
|
)
|
91
91
|
|
92
|
-
@torch.library.impl("rbln_custom_ops::
|
93
|
-
def flash_attn_prefill_cpu(q, k, v, kcache, vcache,
|
92
|
+
@torch.library.impl("rbln_custom_ops::paged_flash_causal_attn_prefill", "cpu")
|
93
|
+
def flash_attn_prefill_cpu(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
|
94
94
|
return q, kcache, vcache
|
95
95
|
|
96
|
-
@register_fake("rbln_custom_ops::
|
97
|
-
def flash_attn_prefill_abstract(q, k, v, kcache, vcache,
|
96
|
+
@register_fake("rbln_custom_ops::paged_flash_causal_attn_prefill")
|
97
|
+
def flash_attn_prefill_abstract(q, k, v, kcache, vcache, seq, scale, block_table, block_size, partition):
|
98
98
|
return q, kcache, vcache
|