optimum-rbln 0.7.4a1__tar.gz → 0.7.4a3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/scripts/auto_code_review.py +4 -22
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/auto_code_review.yml +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/check_code_quality.yml +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/deploy-on-tag.yaml +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/pr-title-check.yaml +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/pr_checklist_validator.yml +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/rbln_optimum_pytest.yaml +3 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/PKG-INFO +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/__version__.py +1 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/modeling.py +8 -1
- optimum_rbln-0.7.4a3/src/optimum/rbln/ops/__init__.py +18 -0
- optimum_rbln-0.7.4a3/src/optimum/rbln/ops/attn.py +287 -0
- optimum_rbln-0.7.4a3/src/optimum/rbln/ops/flash_attn.py +176 -0
- optimum_rbln-0.7.4a3/src/optimum/rbln/ops/kv_cache_update.py +24 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/bart/__init__.py +1 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/decoderonly/__init__.py +10 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +80 -94
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +39 -20
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +17 -13
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +12 -21
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/t5/__init__.py +1 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/t5/t5_architecture.py +3 -4
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/time_series_transformers/__init__.py +1 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/time_series_transformers/time_series_transformers_architecture.py +12 -22
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/whisper/__init__.py +1 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/whisper/modeling_whisper.py +0 -1
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/whisper/whisper_architecture.py +22 -34
- optimum_rbln-0.7.4a1/src/optimum/rbln/ops/__init__.py +0 -22
- optimum_rbln-0.7.4a1/src/optimum/rbln/ops/attn.py +0 -223
- optimum_rbln-0.7.4a1/src/optimum/rbln/ops/flash_attn.py +0 -82
- optimum_rbln-0.7.4a1/src/optimum/rbln/ops/kv_cache_update.py +0 -60
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/ISSUE_TEMPLATE/model_request.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/pull_request_template.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/scripts/validate_pr_checklist.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/version.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/deploy.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/rbln_check_compiler.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/rbln_dispatch_pytest.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/rbln_optimum_inference_test.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/rbln_scheduled_test.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.github/workflows/rbln_trigger_on_pr.yaml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/.gitignore +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/CODE_OF_CONDUCT.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/CONTRIBUTING.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/LICENSE +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/README.md +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/assets/rbln_logo.png +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/audio-classification/run_ast_audio_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/depth-estimation/run_dpt.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/image-classification/run_image_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/image-classification/run_vit_image_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/image-to-text/run_llava_next_image_to_text.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2_combined.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2_img2img_combined.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2_inpaint.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2_inpaint_combined.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/kandinsky2_2/run_kandinsky2_2_prior_interpolate.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/question-answering/run_question_answering.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/speech-recognition/run_wav2vec2.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/speech-recognition/run_whisper.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion_controlnet.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion_img2img_controlnet.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion_inpaint.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion_lora.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/stable-diffusion/run_stable_diffusion_multicontrolnet.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text-classification/run_bge_m3_text_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text-classification/run_bge_reranker_v2_m3_text_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text-classification/run_secureBERT.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text-classification/run_t5_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text-classification/run_twitter_roberta_text_classification.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text2text-generation/run_bart_text2text_generation.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text2text-generation/run_llama_peft.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/text2text-generation/run_llama_text2text_generation.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/examples/time-series-forecasting/run_time_series_forecasting.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/pyproject.toml +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/scripts/uv-lock.sh +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/scripts/uv-sync.sh +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/modeling_diffusers.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/autoencoders/vae.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/autoencoders/vq_model.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/controlnet.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/transformers/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/transformers/prior_transformer.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/transformers/transformer_sd3.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/unets/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/models/unets/unet_2d_condition.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/controlnet/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/modeling_base.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/modeling_config.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/ops/linear.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/modeling_alias.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/modeling_generic.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/modeling_rope_utils.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/auto/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/auto/auto_factory.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/auto/modeling_auto.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/bart/bart_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/bart/modeling_bart.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/bert/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/bert/modeling_bert.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/clip/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/clip/modeling_clip.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/dpt/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/exaone/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/exaone/modeling_exaone.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/gemma/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/gemma/gemma_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/gemma/modeling_gemma.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/gpt2/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/llama/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/llama/llama_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/llama/modeling_llama.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/llava_next/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/midm/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/midm/midm_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/midm/modeling_midm.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/mistral/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/phi/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/phi/modeling_phi.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/phi/phi_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/qwen2/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/seq2seq/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/t5/modeling_t5.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/time_series_transformers/modeling_time_series_transformers.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/wav2vec2/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/whisper/generation_whisper.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/xlm_roberta/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/utils/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/transformers/utils/rbln_quantization.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/decorator_utils.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/hub.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/import_utils.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/logging.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/model_utils.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/runtime_utils.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/save_utils.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/src/optimum/rbln/utils/submodule.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/__init__.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/psnr.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/requirements_sdxl.txt +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/run_stable_diffusion_xl_base.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/test_base.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/test_diffusers.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/test_llm.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/tests/test_transformers.py +0 -0
- {optimum_rbln-0.7.4a1 → optimum_rbln-0.7.4a3}/uv.lock +0 -0
.github/scripts/auto_code_review.py:

```diff
@@ -22,8 +22,7 @@ from github import Github
 
 model_name = os.environ["GOOGLE_MODEL_ID"]
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
-max_context_token =
-force_review = False
+max_context_token = 500000
 
 
 def get_pr_diff():
@@ -58,19 +57,6 @@ Review the following code changes(GIT DIFF) along with the pull request (PR) det
     return system_prompt, prompt
 
 
-def translate_review(review):
-    model = genai.GenerativeModel(
-        model_name,
-        system_instruction="You are a professional translator specializing in technical and software-related content. Keep the technical words in English, but understand the whole sentence and rephrase it in Korean.",
-    )
-    prompt = f"""Translate the following English text into Korean, maintaining technical accuracy and clarity. Include ONLY the translation, NO OTHER EXPLANATIONS or RESPONSES as a chatbot. :
-
-{review}"""
-    response = model.generate_content(prompt)
-
-    return response.text
-
-
 def review_code(system_prompt, prompt):
     model = genai.GenerativeModel(model_name, system_instruction=system_prompt)
     response = model.generate_content(prompt)
@@ -125,7 +111,7 @@ def main():
     system_prompt, prompt = get_prompt(diff, pr)
     model = genai.GenerativeModel(model_name=model_name, system_instruction=system_prompt)
     num_tokens = model.count_tokens(prompt).total_tokens
-    if num_tokens > max_context_token
+    if num_tokens > max_context_token:
         msg = f"Diff ({len(diff)}) exceeds maximum allowed tokens ({max_context_token}) > ({num_tokens}). Skipping review."
         print(msg)
         pr.create_issue_comment(msg)
@@ -133,14 +119,10 @@ def main():
 
     # Get Auto review
     review = review_code(system_prompt, prompt)
-    translation = translate_review(review)
 
     # Post comment on PR
-    pr.create_issue_comment(f"""# Auto Code Review
-
-    - [Note] To invoke Auto Code Review, start the commit message with [autoreview] or post "/autoreview" as a comment, then
-    re-run the code review job in that commit's GitHub Action.
-    \n\n{review}\n\n{translation}""")
+    pr.create_issue_comment(f"""# Auto Code Review by {model_name}
+    \n\n{review}""")
 
 
 if __name__ == "__main__":
```
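After this change, the review path reduces to a token-count guard plus a single model call. A minimal sketch, assuming the same `google.generativeai` client the script imports as `genai` (the 500000-token ceiling comes from the hunk above):

```python
# Minimal sketch of the post-change review flow, assuming the
# google.generativeai client the script already uses.
import os
from typing import Optional

import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model_name = os.environ["GOOGLE_MODEL_ID"]
max_context_token = 500000  # value added by the hunk above


def review(system_prompt: str, prompt: str) -> Optional[str]:
    model = genai.GenerativeModel(model_name, system_instruction=system_prompt)
    # Skip oversized diffs entirely instead of truncating them mid-hunk.
    if model.count_tokens(prompt).total_tokens > max_context_token:
        return None
    return model.generate_content(prompt).text
```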
.github/workflows/rbln_optimum_pytest.yaml:

```diff
@@ -43,7 +43,9 @@ jobs:
       if: ${{ inputs.commit_message == '' }}
       run: |
         COMMIT_MESSAGE=$(git log -1 --pretty=%B)
-        echo "message
+        echo "message<<EOF" >> $GITHUB_OUTPUT
+        echo "$COMMIT_MESSAGE" >> $GITHUB_OUTPUT
+        echo "EOF" >> $GITHUB_OUTPUT
 
     - name: Setup uv
       uses: astral-sh/setup-uv@v3
```
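A single-line `echo "message=..."` breaks as soon as the commit message spans multiple lines; GitHub's documented `name<<EOF ... EOF` delimiter syntax for `$GITHUB_OUTPUT` handles that. A small Python sketch of the same format (the helper name is hypothetical):

```python
# Hypothetical helper emitting a multiline step output in the same
# name<<EOF ... EOF format the workflow above switches to.
import os


def set_multiline_output(name: str, value: str) -> None:
    # $GITHUB_OUTPUT is a file GitHub Actions appends step outputs to.
    with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh:
        fh.write(f"{name}<<EOF\n{value}\nEOF\n")


set_multiline_output("message", "fix: paged attention\n\nLonger body here.")
```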
PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: optimum-rbln
-Version: 0.7.4a1
+Version: 0.7.4a3
 Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
 Project-URL: Homepage, https://rebellions.ai
 Project-URL: Documentation, https://docs.rbln.ai
```
src/optimum/rbln/modeling.py:

```diff
@@ -123,8 +123,15 @@ class RBLNModel(RBLNBaseModel):
             config = AutoConfig.from_pretrained(config._name_or_path, **kwargs)
 
         if hasattr(model, "can_generate") and model.can_generate():
+            import json
+
             generation_config = model.generation_config
-
+            generation_config_path = save_dir_path / subfolder / "generation_config.json"
+
+            generation_config.save_pretrained(generation_config_path.parent)
+            local_config = json.loads(generation_config_path.read_text(encoding="utf-8"))
+            local_config["transformers_version"] = generation_config.transformers_version
+            generation_config_path.write_text(json.dumps(local_config, indent=2) + "\n", encoding="utf-8")
 
         if not isinstance(config, PretrainedConfig):  # diffusers config
             config = PretrainedConfig(**config)
```
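The new block works around the fact that `GenerationConfig.save_pretrained` stamps the JSON with the locally installed `transformers` version; rewriting the file afterwards preserves the version recorded on the in-memory config. A standalone sketch of the same pattern (the directory name is illustrative):

```python
# Standalone sketch of the hunk above; "out_dir" is illustrative.
import json
from pathlib import Path

from transformers import GenerationConfig

generation_config = GenerationConfig()  # stands in for model.generation_config
path = Path("out_dir") / "generation_config.json"

generation_config.save_pretrained(path.parent)  # stamps the installed version

# Restore the version carried by the config object itself.
data = json.loads(path.read_text(encoding="utf-8"))
data["transformers_version"] = generation_config.transformers_version
path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
```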
optimum_rbln-0.7.4a3/src/optimum/rbln/ops/__init__.py (new file):

```diff
@@ -0,0 +1,18 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .attn import *
+from .flash_attn import *
+from .kv_cache_update import *
+from .linear import linear
```
optimum_rbln-0.7.4a3/src/optimum/rbln/ops/attn.py (new file):

```diff
@@ -0,0 +1,287 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@paged_attn_decode.register_fake
+def paged_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+    IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+    a single optimized NPU operation. It is NOT meant for CPU execution.
+
+    Key differences from decode pattern:
+    - Handles prefill phase with multiple input tokens
+    - Takes explicit batch index for continuous batching
+
+    Expected tensor shapes:
+    - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+    - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+    - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+    - mask: [batch=1, 1, 1, seq_len, max_seq_len] - Attention mask
+    - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+    - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+    - seq: [1, 1] - Starting sequence position
+    - scale: [] - Attention scale factor
+    - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+    - block_size: [] - Number of tokens per block
+
+    Returns:
+        Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+    """
+    return torch.empty_like(q)
+
+
+@paged_attn_prefill.register_fake
+def paged_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_causal_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_causal_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    """Defines the computation pattern for fused attention with KV cache updates.
+
+    IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+    a single optimized NPU operation. It is NOT meant for CPU execution.
+
+    Pattern components that compiler fuses into a single op:
+    1. KV cache updates with new key/value states
+    2. Scaled dot-product attention computation
+    3. Causal masked softmax operation
+    4. Final attention output computation
+
+    Expected tensor shapes:
+    - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+    - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+    - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+    - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+    - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+    - seq: [1, 1] - Starting sequence position
+    - scale: [] - Attention scale factor
+    - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+    - block_size: [] - Number of tokens per block
+
+    Returns:
+        Tensor: attn_output: [batch=1, n_heads, n_groups, 1, head_dim] - Attention output
+    """
+    return torch.empty_like(q)
+
+
+@paged_causal_attn_decode.register_fake
+def paged_causal_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_causal_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_causal_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    """Defines the computation pattern for prefill phase attention with KV cache updates.
+
+    IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+    a single optimized NPU operation. It is NOT meant for CPU execution.
+
+    Key differences from decode pattern:
+    - Handles prefill phase with multiple input tokens
+    - Takes explicit batch index for continuous batching
+
+    Expected tensor shapes:
+    - q: [batch=1, n_heads, n_groups, seq_len, head_dim] - Query states for multiple tokens
+    - k: [batch=1, n_heads, 1, seq_len, head_dim] - Key states for current input
+    - v: [batch=1, n_heads, 1, seq_len, head_dim] - Value states for current input
+    - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+    - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+    - batch: [1] - Batch index for cache access
+    - seq: [1, 1] - Starting sequence position
+    - scale: [] - Attention scale factor
+    - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+    - block_size: [] - Number of tokens per block
+
+    Returns:
+        Tensor: attn_output: [batch=1, n_heads, n_groups, seq_len, head_dim] - Attention output
+    """
+    return torch.empty_like(q)
+
+
+@paged_causal_attn_prefill.register_fake
+def paged_causal_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_add_softmax_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_add_softmax_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    """Defines the computation pattern for fused attention with KV cache updates.
+
+    IMPORTANT: This op serves as a pattern definition for the RBLN compiler to generate
+    a single optimized NPU operation. It is NOT meant for CPU execution.
+
+    Pattern components that compiler fuses into a single op:
+    1. KV cache updates with new key/value states
+    2. Scaled dot-product attention computation
+    3. add-softmax operation
+    4. Final attention output computation
+
+    Expected tensor shapes:
+    - q: [batch=1, n_heads, n_groups, 1, head_dim] - Query states for single token
+    - k: [batch=1, n_heads, 1, 1, head_dim] - Key states for current input
+    - v: [batch=1, n_heads, 1, 1, head_dim] - Value states for current input
+    - mask: [batch=1, n_heads, 1, 1, max_seq_len] - Attention mask
+    - kcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Key cache
+    - vcache: [batch_size, n_heads, 1, max_seq_len, head_dim] - Value cache
+    - seq: [1] - Current sequence position
+    - scale: [] - Attention scale factor
+    - block_table: [batch_size, max_seq_len // block_size] - Block indices for KV cache management
+    - block_size: [] - Number of tokens per block
+
+    Returns:
+        Tensor: attn_output: [batch=1, n_heads, 1, 1, head_dim] - Attention output
+    """
+    return torch.empty_like(q)
+
+
+@paged_add_softmax_attn_decode.register_fake
+def paged_add_softmax_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+) -> Tensor:
+    return torch.empty_like(q)
```
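These definitions are compiler-facing patterns, so running one on CPU only yields an empty placeholder, but the call is still useful for checking shapes. A hedged sketch using the shapes documented in the docstrings above (all sizes illustrative; the import assumes the `from .attn import *` re-export shown earlier):

```python
# Shape check for the decode pattern; on CPU the op returns an empty
# placeholder — the fused kernel exists only on the RBLN NPU.
import torch

from optimum.rbln.ops import paged_attn_decode  # assumed re-export

n_heads, n_groups, head_dim = 8, 4, 64
max_seq_len, block_size, batch_size = 1024, 128, 2

q = torch.randn(1, n_heads, n_groups, 1, head_dim)
k = torch.randn(1, n_heads, 1, 1, head_dim)
v = torch.randn(1, n_heads, 1, 1, head_dim)
mask = torch.zeros(1, n_heads, 1, 1, max_seq_len)
kcache = torch.zeros(batch_size, n_heads, 1, max_seq_len, head_dim)
vcache = torch.zeros(batch_size, n_heads, 1, max_seq_len, head_dim)
seq = torch.tensor([[0]])
scale = torch.tensor(head_dim**-0.5)
block_table = torch.arange(max_seq_len // block_size).unsqueeze(0).repeat(batch_size, 1)

out = paged_attn_decode(q, k, v, mask, kcache, vcache, seq, scale, block_table, block_size)
assert out.shape == q.shape  # every pattern op preserves the query shape
```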
optimum_rbln-0.7.4a3/src/optimum/rbln/ops/flash_attn.py (new file):

```diff
@@ -0,0 +1,176 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    """Defines the computation pattern for fused flash attention with KV cache for decoding.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_attn_decode.register_fake
+def paged_flash_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    """Defines the computation pattern for fused flash attention with KV cache for prefill.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_attn_prefill.register_fake
+def paged_flash_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    mask: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_causal_attn_decode",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_causal_attn_decode(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    """Defines the computation pattern for fused causal flash attention with KV cache for decoding.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_causal_attn_decode.register_fake
+def paged_flash_causal_attn_decode_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    return torch.empty_like(q)
+
+
+@torch.library.custom_op(
+    "rbln_custom_ops::paged_flash_causal_attn_prefill",
+    mutates_args=(["kcache", "vcache"]),
+)
+def paged_flash_causal_attn_prefill(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    """Defines the computation pattern for fused causal flash attention with KV cache for prefill.
+
+    Returns a tensor with the same shape as q.
+    """
+    return torch.empty_like(q)
+
+
+@paged_flash_causal_attn_prefill.register_fake
+def paged_flash_causal_attn_prefill_fake(
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    kcache: Tensor,
+    vcache: Tensor,
+    seq: Tensor,
+    scale: Tensor,
+    block_table: Tensor,
+    block_size: int,
+    partition: int,
+) -> Tensor:
+    return torch.empty_like(q)
```
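The `register_fake` companions are what let these ops participate in `torch.compile`/`torch.export` tracing: under fake-tensor propagation the fake implementation supplies output metadata without running any kernel. A hedged sketch (import path assumed as above; all sizes and dtypes illustrative):

```python
# Tracing-time shape propagation through the *_fake registrations;
# no real kernel runs.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

from optimum.rbln.ops import paged_flash_causal_attn_decode  # assumed re-export

with FakeTensorMode():
    q = torch.empty(1, 8, 4, 1, 64)
    k = torch.empty(1, 8, 1, 1, 64)
    v = torch.empty(1, 8, 1, 1, 64)
    kcache = torch.empty(2, 8, 1, 1024, 64)
    vcache = torch.empty(2, 8, 1, 1024, 64)
    seq = torch.zeros(1, 1, dtype=torch.int32)
    scale = torch.empty(())
    block_table = torch.zeros(2, 8, dtype=torch.int32)
    # Trailing ints: block_size=128, partition=512 (the extra flash-attention
    # tile-length argument these variants add over the ops in attn.py).
    out = paged_flash_causal_attn_decode(
        q, k, v, kcache, vcache, seq, scale, block_table, 128, 512
    )

print(out.shape)  # torch.Size([1, 8, 4, 1, 64]), supplied by the fake impl
```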
optimum_rbln-0.7.4a3/src/optimum/rbln/ops/kv_cache_update.py (new file):

```diff
@@ -0,0 +1,24 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op("rbln_custom_ops::rbln_cache_update", mutates_args=(["cache"]))
+def rbln_cache_update(cache: Tensor, state: Tensor, position: Tensor, axis: Tensor) -> Tensor:
+    # Define the RBLN custom operation "rbln_cache_update" which updates a cache tensor with a given state tensor.
+    # This operation is designed to perform in-place updates directly on the device without needing to transfer the cache back to the host.
+    # The `position` parameter specifies the start index for the update along the specified axis, allowing flexible updates to any part of the cache tensor.
+    return torch.empty_like(cache)
```
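A hedged CPU reference for the semantics the comments describe: copy `state` into `cache` starting at `position` along `axis`. The real op runs in place on the RBLN device; this sketch (helper name hypothetical) is only an illustration.

```python
# CPU reference of the cache-update semantics; the real op is device-side.
import torch


def cache_update_reference(cache, state, position, axis):
    pos = int(position)
    length = state.shape[axis]
    idx = torch.arange(pos, pos + length)
    # Write `state` into cache[..., pos:pos+length, ...] along `axis`, in place.
    cache.index_copy_(axis, idx, state)
    return cache


cache = torch.zeros(2, 4, 16, 8)   # e.g. [batch, heads, max_seq, head_dim]
state = torch.randn(2, 4, 3, 8)    # three new positions
cache_update_reference(cache, state, torch.tensor(5), 2)
```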
src/optimum/rbln/transformers/models/decoderonly/__init__.py:

```diff
@@ -12,4 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from ....ops import (
+    paged_attn_decode,
+    paged_attn_prefill,
+    paged_causal_attn_decode,
+    paged_causal_attn_prefill,
+    paged_flash_attn_decode,
+    paged_flash_attn_prefill,
+    paged_flash_causal_attn_decode,
+    paged_flash_causal_attn_prefill,
+)
 from .modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM
```