sinapsis-huggingface 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.0.dist-info}/METADATA +51 -13
  2. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.0.dist-info}/RECORD +17 -12
  3. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.0.dist-info}/WHEEL +1 -1
  4. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/__init__.py +3 -0
  5. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/base_transformers.py +11 -0
  6. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/image_to_text_transformers.py +1 -0
  7. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_base.py +97 -0
  8. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_detection.py +124 -0
  9. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_inference.py +260 -0
  10. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py +1 -0
  11. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py +1 -0
  12. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py +2 -1
  13. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py +1 -0
  14. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/__init__.py +0 -0
  15. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/helpers.py +70 -0
  16. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.0.dist-info}/licenses/LICENSE +0 -0
  17. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-huggingface
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Package for HuggingFace-based templates
5
5
  Author-email: SinapsisAI <dev@sinapsis.tech>
6
6
  License: GNU AFFERO GENERAL PUBLIC LICENSE
@@ -822,22 +822,28 @@ The **Sinapsis web applications** provide an interactive way to explore and expe
822
822
  > [!IMPORTANT]
823
823
  > To run any of the apps, you first need to clone this repo:
824
824
 
825
+ ```bash
826
+ git clone git@github.com:Sinapsis-ai/sinapsis-huggingface.git
827
+ cd sinapsis-huggingface
828
+ ```
829
+
825
830
  > [!NOTE]
826
831
  > If you'd like to enable external app sharing in Gradio, `export GRADIO_SHARE_APP=True`
827
832
 
828
833
  > [!NOTE]
829
834
  > Agent configuration can be changed through the AGENT_CONFIG_PATH env var. You can check the available configurations in each package configs folder.
830
835
 
836
+ > [!IMPORTANT]
837
+ > Please make sure you have a valid Hugging Face access token in order to run the PaliGemma webapp. For further instructions on how to create an access token, see
838
+ > https://huggingface.co/docs/transformers.js/en/guides/private
839
+
840
+
831
841
 
832
- ```bash
833
- git clone git@github.com:Sinapsis-ai/sinapsis-huggingface.git
834
- cd sinapsis-huggingface
835
- ```
836
842
 
837
843
  <details>
838
844
  <summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Build with Docker</span></strong></summary>
839
845
 
840
- **IMPORTANT** The docker image depends on the sinapsis-nvidia:base image. To build it, refer to the [official sinapsis documentation]([https://](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker)
846
+ **IMPORTANT** The docker image depends on the sinapsis-nvidia:base image. To build it, refer to the [official sinapsis documentation](https://github.com/Sinapsis-AI/sinapsis/blob/main/README.md#docker)
841
847
 
842
848
 
843
849
  1. **Build the sinapsis-huggingface image**:
@@ -845,17 +851,35 @@ cd sinapsis-huggingface
845
851
  docker compose -f docker/compose.yaml build
846
852
  ```
847
853
  2. **Start the container**:
854
+
855
+ For the Diffusers app:
848
856
  ```bash
849
857
  docker compose -f docker/compose_diffusers.yaml up sinapsis-huggingface-diffusers-gradio -d
850
858
  ```
851
- **NOTE**: There is also a service to deploy the vision app. To do so, use:
859
+ For the Grounding-Dino app:
852
860
  ```bash
853
861
  docker compose -f docker/compose_vision.yaml up sinapsis-huggingface-vision-gradio -d
854
862
  ```
863
+ For the PaliGemma app:
864
+
865
+ ```bash
866
+ export HF_TOKEN="your_huggingface_token"
867
+ docker compose -f docker/compose_pali_gemma.yaml up sinapsis-huggingface-paligemma-gradio -d
868
+ ```
855
869
  3. **Check the status**:
870
+
871
+ For the Diffusers app:
856
872
  ```bash
857
873
  docker logs -f sinapsis-huggingface-diffusers-gradio
858
874
  ```
875
+ For the Grounding-Dino app:
876
+ ```bash
877
+ docker logs -f sinapsis-huggingface-vision-gradio
878
+ ```
879
+ For the PaliGemma app:
880
+ ```bash
881
+ docker logs -f sinapsis-huggingface-paligemma-gradio
882
+ ```
859
883
  **NOTE**: If using the vision app, please change the name of the service accordingly
860
884
 
861
885
  4. **The logs will display the URL to access the webapp, e.g.,**:
@@ -865,9 +889,19 @@ Running on local URL: http://127.0.0.1:7860
865
889
  **NOTE**: The local URL can be different, please check the logs
866
890
 
867
891
  5. **To stop the app**:
892
+
893
+ For the Diffusers app:
868
894
  ```bash
869
895
  docker compose -f docker/compose_diffusers.yaml down
870
896
  ```
897
+ For the Grounding-Dino app:
898
+ ```bash
899
+ docker compose -f docker/compose_vision.yaml down
900
+ ```
901
+ For the PaliGemma app:
902
+ ```bash
903
+ docker compose -f docker/compose_pali_gemma.yaml down
904
+ ```
871
905
  </details>
872
906
 
873
907
  <details>
@@ -886,19 +920,23 @@ uv pip install sinapsis-huggingface[all] --extra-index-url https://pypi.sinapsis
886
920
  ```
887
921
  3. Run the webapp.
888
922
 
923
+ For the Diffusers app:
889
924
  ```bash
890
925
  uv run webapps/diffusers_demo.py
891
926
  ```
892
-
893
- 4. The terminal will display the URL to access the webapp, e.g., :
927
+ For the Grounding-Dino app:
894
928
  ```bash
895
- Running on local URL: http://127.0.0.1:7860
929
+ uv run webapps/vision_demo.py
930
+ ```
931
+ For the PaliGemma app:
932
+ ```bash
933
+ export HF_TOKEN="your_huggingface_token"
934
+ uv run webapps/paligemma_demo.py
896
935
  ```
897
936
 
898
- **NOTE**: If you want to try the vision app, in step 5 change the command to:
899
-
937
+ 4. The terminal will display the URL to access the webapp, e.g.:
900
938
  ```bash
901
- python webapps/vision_demo.py
939
+ Running on local URL: http://127.0.0.1:7860
902
940
  ```
903
941
 
904
942
  </details>
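The PaliGemma webapp introduced above needs a valid Hugging Face access token. A minimal sketch of how the token could be checked before launching the demo, assuming the `huggingface_hub` client that ships with `transformers` (the token value is a placeholder):

```python
# Sketch (assumed workflow): confirm the Hugging Face token is valid before running
# webapps/paligemma_demo.py, mirroring the `export HF_TOKEN=...` step in the README above.
import os

from huggingface_hub import whoami

os.environ["HF_TOKEN"] = "your_huggingface_token"  # placeholder, same variable the app reads
print(whoami(token=os.environ["HF_TOKEN"])["name"])  # prints your account name if the token works
```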
@@ -1,4 +1,4 @@
1
- sinapsis_huggingface-0.1.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
1
+ sinapsis_huggingface-0.2.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
2
2
  sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/__init__.py,sha256=9FHbS4hse9WIE-1a5jJlG-23gB3wahlULANJAWQ464c,947
4
4
  sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/base_diffusers.py,sha256=bJOF3w4iwd9dwtwgvaN9tIlBYpgpFL-AIM1u1Zg3Cys,8248
@@ -20,14 +20,19 @@ sinapsis_huggingface_grounding_dino/src/sinapsis_huggingface_grounding_dino/temp
20
20
  sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/helpers/__init__.py,sha256=RYEd6xTaVlItleSPoq9RVJIFgXfY6aOHqy2SIO7zwjc,168
22
22
  sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/helpers/text_to_sentences.py,sha256=teaJXoTAVzGwar9gxenBabkA9VBJd-VAxsNXlzkKMuU,1676
23
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/__init__.py,sha256=9DyCy0TMGrSwgzoa0z_xwH6idpbTwSz7yyR4kKuLEY0,852
24
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/base_transformers.py,sha256=ExoV83NIjBnJZGbWLsFpb3bcTjzhTPGDXbTaSAuAP-Q,5235
25
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/image_to_text_transformers.py,sha256=LGiKWlkATlmOGht-6CNRfHHc94fSSUIZC8Zosu7Qq3Y,2571
26
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py,sha256=eiK7Mfrbpx5qmWaS0xi3nx7cX4ngmy2pN0sWXQA90P4,2318
27
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py,sha256=ZcvISBOfnSyjiiEDoDvejm6dh06MxgvIOGzYAseql6o,1974
28
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py,sha256=FW73tLKORq5jMpJHYWufQV5j68nQG5viCI_zoMyL4Fk,5805
29
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py,sha256=TmiQ3yqY8GBxTjPvpAdxp2jqFbESHr0mHRZ7SqjuCVU,2506
30
- sinapsis_huggingface-0.1.0.dist-info/METADATA,sha256=ToTFGyWmCAN8X5NtN_ABswlsJAipHIDJmJEsoXqyaw4,50408
31
- sinapsis_huggingface-0.1.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
32
- sinapsis_huggingface-0.1.0.dist-info/top_level.txt,sha256=ZxHwnMjSWRceQL_6-B7GJBPxQWdlwkba-SYMVufhj5s,133
33
- sinapsis_huggingface-0.1.0.dist-info/RECORD,,
23
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/__init__.py,sha256=3BgUm6C_tRgzxh2ADMBcu6OHzR-U5Tl1eFVtU0PwxB0,1095
24
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/base_transformers.py,sha256=SJTfLHIkNidTQeh_EXdzKXEHnszWEdxiZ2F2dc0HGPc,5658
25
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/image_to_text_transformers.py,sha256=y4rOh4yYssM2PBIFdXJswPHOs0Y9sb5Bp7TSDbsCwGE,2601
26
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py,sha256=N_zQiWHcz6LiQjOfJdeOdauwxrlqI9O360v5GVE3TwQ,2348
27
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py,sha256=b7GoMba6exEdCq9q6rOrDjoL7blxq8DKpQ_fCiOvwVM,2004
28
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py,sha256=PUpS4Kohe8D_5E5RYnYcEqFZd-koFBkm0r1Sihe_b70,5835
29
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py,sha256=okMCToQpqcKs4Y2gHyppJ6p4A3pm0drInqUMvSQw1jk,2536
30
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_base.py,sha256=O6pFLLR4uLtnMCV1Pn5HS7_Ab51vXdv3lkaI-UeCOsA,3707
31
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_detection.py,sha256=NrWxI8k7oVlLyaf7FjUuSc6J4eXK3ngM8RZwPb6tLL0,4122
32
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_inference.py,sha256=5RBXPUgxOBEM4UHrwfcmPW5dmrktDy44pXVkce8piRs,10387
33
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/helpers.py,sha256=IGeYd5U2xpimpwTQW_5xm1pUYB5tqHlpq-fjwBHI4gY,2187
35
+ sinapsis_huggingface-0.2.0.dist-info/METADATA,sha256=7riD8-0RoTZkA4-zBD_e1tvaeQKlAjHQAw998kXel80,51211
36
+ sinapsis_huggingface-0.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
37
+ sinapsis_huggingface-0.2.0.dist-info/top_level.txt,sha256=ZxHwnMjSWRceQL_6-B7GJBPxQWdlwkba-SYMVufhj5s,133
38
+ sinapsis_huggingface-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (77.0.3)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -6,6 +6,9 @@ _root_lib_path = "sinapsis_huggingface_transformers.templates"
6
6
 
7
7
  _template_lookup = {
8
8
  "ImageToTextTransformers": f"{_root_lib_path}.image_to_text_transformers",
9
+ "PaliGemmaDetection": f"{_root_lib_path}.pali_gemma.pali_gemma_detection",
10
+ "PaliGemmaInference": f"{_root_lib_path}.pali_gemma.pali_gemma_inference",
11
+ "PaliGemmaSegmentation": f"{_root_lib_path}.pali_gemma.pali_gemma_segmentation",
9
12
  "SpeechToTextTransformers": f"{_root_lib_path}.speech_to_text_transformers",
10
13
  "SummarizationTransformers": f"{_root_lib_path}.summarization_transformers",
11
14
  "TextToSpeechTransformers": f"{_root_lib_path}.text_to_speech_transformers",
@@ -63,6 +63,17 @@ class TransformersBase(Template):
63
63
  self._TORCH_DTYPE = {"float16": torch.float16, "float32": torch.float32}
64
64
  self.task: str | None = None
65
65
  self._set_seed()
66
+
67
+ def setup_pipeline(self) -> None:
68
+ """Initialize and configure the HuggingFace Transformers processing pipeline.
69
+
70
+ Raises:
71
+ ValueError: If called before the task attribute is set. The task must be
72
+ defined by the child class before pipeline initialization.
73
+ """
74
+ if self.task is None:
75
+ raise ValueError("'task' must be assigned before pipeline setup")
76
+
66
77
  self.processor = self._initialize_processor()
67
78
  self.pipeline = self.initialize_pipeline()
68
79
 
@@ -38,6 +38,7 @@ class ImageToTextTransformers(TransformersBase):
38
38
  def __init__(self, attributes: TemplateAttributeType) -> None:
39
39
  super().__init__(attributes)
40
40
  self.task = "image-to-text"
41
+ self.setup_pipeline()
41
42
 
42
43
  @staticmethod
43
44
  def _convert_to_pil(image_content: Image.Image | np.ndarray) -> Image.Image:
@@ -0,0 +1,97 @@
1
+ # -*- coding: utf-8 -*-
2
+ from abc import abstractmethod
3
+ from typing import Any, ClassVar, Literal
4
+
5
+ import torch
6
+ from sinapsis_core.data_containers.data_packet import DataContainer
7
+ from sinapsis_core.template_base import (
8
+ Template,
9
+ TemplateAttributes,
10
+ TemplateAttributeType,
11
+ )
12
+ from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
13
+ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
14
+
15
+
16
+ class PaliGemmaBaseAttributes(TemplateAttributes):
17
+ """Base attributes for PaliGemma models.
18
+
19
+ Attributes:
20
+ model_path (str): Path to the pretrained PaliGemma model. Can be either:
21
+ - A Hugging Face model identifier (e.g. 'google/paligemma-3b-mix-224')
22
+ - A local directory path containing the model files
23
+ processor_path (str): Path to the model processor/tokenizer. Can be either:
24
+ - A Hugging Face model identifier
25
+ - A local directory path containing the processor files
26
+ model_cache_dir (str): Directory for caching model files when downloading from Hugging Face.
27
+ device (Literal["cuda", "cpu"]): Device to run the model on. Defaults to cpu.
28
+ max_new_tokens (int): Maximum number of tokens to generate. Defaults to 200.
29
+ torch_dtype (Literal["float16", "float32"]): Model precision type. Defaults to float16.
30
+ """
31
+
32
+ model_path: str
33
+ processor_path: str
34
+ model_cache_dir: str = str(SINAPSIS_CACHE_DIR)
35
+ device: Literal["cuda", "cpu"] = "cpu"
36
+ max_new_tokens: int = 200
37
+ torch_dtype: Literal["float16", "float32"] = "float16"
38
+
39
+
40
+ class PaliGemmaBase(Template):
41
+ """Base class for PaliGemma implementations."""
42
+
43
+ AttributesBaseModel = PaliGemmaBaseAttributes
44
+ CATEGORY = "Transformers"
45
+ _TORCH_DTYPE: ClassVar[dict[str, Any]] = {"float16": torch.float16, "float32": torch.float32}
46
+
47
+ def __init__(self, attributes: TemplateAttributeType) -> None:
48
+ super().__init__(attributes)
49
+ self.model = self._setup_model()
50
+ self.processor = self._setup_processor()
51
+
52
+ def _setup_model(
53
+ self,
54
+ ) -> PaliGemmaForConditionalGeneration:
55
+ """Initialize model with proper device placement and precision settings.
56
+
57
+ Handles the loading of model components, configuring
58
+ it according to the specified device and precision requirements.
59
+
60
+ Returns:
61
+ PaliGemmaForConditionalGeneration: Initialized and configured model.
62
+ """
63
+
64
+ model = PaliGemmaForConditionalGeneration.from_pretrained(
65
+ self.attributes.model_path,
66
+ cache_dir=self.attributes.model_cache_dir,
67
+ torch_dtype=self._TORCH_DTYPE.get(self.attributes.torch_dtype),
68
+ ).to(self.attributes.device)
69
+
70
+ return model
71
+
72
+ def _setup_processor(self) -> AutoProcessor:
73
+ """Initialize processor with proper device placement and precision settings.
74
+
75
+ Handles the loading of processor components, configuring
76
+ it according to the specified cache and precision requirements.
77
+
78
+ Returns:
79
+ AutoProcessor: Initialized and configured processor.
80
+ """
81
+ processor = AutoProcessor.from_pretrained(
82
+ self.attributes.processor_path,
83
+ cache_dir=self.attributes.model_cache_dir,
84
+ torch_dtype=self._TORCH_DTYPE.get(self.attributes.torch_dtype),
85
+ )
86
+ return processor
87
+
88
+ @abstractmethod
89
+ def execute(self, container: DataContainer) -> DataContainer:
90
+ """Execute method to be implemented by child classes.
91
+
92
+ Args:
93
+ container (DataContainer): The input data container to be processed.
94
+
95
+ Returns:
96
+ DataContainer: The processed container with model outputs.
97
+ """
@@ -0,0 +1,124 @@
1
+ # -*- coding: utf-8 -*-
2
+ from dataclasses import dataclass
3
+
4
+ from sinapsis_core.data_containers.annotations import ImageAnnotations
5
+ from sinapsis_core.template_base import TemplateAttributeType
6
+ from sinapsis_huggingface_transformers.templates.pali_gemma.pali_gemma_inference import (
7
+ PaliGemmaInference,
8
+ PaliGemmaInferenceAttributes,
9
+ )
10
+ from sinapsis_huggingface_transformers.thirdparty.helpers import (
11
+ get_matches,
12
+ parse_label,
13
+ parse_location_tokens,
14
+ )
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class PaliGemmaDetectionKeys:
19
+ "Keys to use during detection"
20
+
21
+ detection_prompt: str = "detect {}"
22
+
23
+
24
+ class PaliGemmaDetectionAttributes(PaliGemmaInferenceAttributes):
25
+ """Configuration attributes for PaliGemma object detection tasks.
26
+
27
+ This class extends the base inference attributes to handle object detection specific configurations.
28
+
29
+ Attributes:
30
+ objects_to_detect (str | list[str]): Target objects to detect, can be a single string or list of strings
31
+ """
32
+
33
+ objects_to_detect: str | list[str]
34
+
35
+
36
+ class PaliGemmaDetection(PaliGemmaInference):
37
+ """Implementation of PaliGemma object detection pipeline.
38
+
39
+ The template inherits functionality from its base class, extending
40
+ the functionality to run inference on an image and to identify
41
+ the objects from the attributes.
42
+
43
+ Usage example:
44
+
45
+ agent:
46
+ name: my_test_agent
47
+ templates:
48
+ - template_name: InputTemplate
49
+ class_name: InputTemplate
50
+ attributes: {}
51
+ - template_name: PaliGemmaDetection
52
+ class_name: PaliGemmaDetection
53
+ template_input: InputTemplate
54
+ attributes:
55
+ model_path: '/path/to/paligemma/model'
56
+ processor_path: '/path/to/processor'
57
+ model_cache_dir: /path/to/cache/dir
58
+ device: 'cuda'
59
+ max_new_tokens: 200
60
+ torch_dtype: float16
61
+ prompt: <image> caption en
62
+ objects_to_detect: 'object to detect'
63
+
64
+ """
65
+
66
+ AttributesBaseModel = PaliGemmaDetectionAttributes
67
+ KEYS = PaliGemmaDetectionKeys
68
+
69
+ def __init__(self, attributes: TemplateAttributeType) -> None:
70
+ super().__init__(attributes)
71
+
72
+ objects_str = self.initialize_objects_str()
73
+ self.prompt = self.KEYS.detection_prompt.format(objects_str)
74
+
75
+ def initialize_objects_str(self) -> str:
76
+ """
77
+ Build the objects-to-detect string in the format expected by the detection prompt.
78
+
79
+ Returns:
80
+ str: String listing the objects to include in the detection prompt.
81
+ """
82
+
83
+ if isinstance(self.attributes.objects_to_detect, str):
84
+ return self.attributes.objects_to_detect
85
+ return "; ".join(self.attributes.objects_to_detect)
86
+
87
+ def _format_text_for_prompt(self, text: str) -> str:
88
+ """Formats input text as a detection prompt.
89
+
90
+ Args:
91
+ text (str): Raw text content (expected to be objects to detect)
92
+
93
+ Returns:
94
+ str: Formatted detection prompt
95
+ """
96
+ return self.KEYS.detection_prompt.format(text)
97
+
98
+ def _create_annotation(
99
+ self, caption: str, confidence: float, image_shape: tuple[int, ...]
100
+ ) -> list[ImageAnnotations]:
101
+ """Creates structured annotations from detection model outputs.
102
+
103
+ Processes the model's output caption to extract bounding box coordinates
104
+ and object labels for each detected instance.
105
+
106
+ Args:
107
+ caption (str): Raw detection output from the model
108
+ confidence (float): Confidence score for the predictions
109
+ image_shape (tuple[int, ...]): Dimensions of the input image (height, width)
110
+
111
+ Returns:
112
+ list[ImageAnnotations]: List of annotations containing bounding boxes and labels
113
+ for each detected object
114
+ """
115
+ annotations = []
116
+ matches = get_matches(caption)
117
+
118
+ for match_coord in matches:
119
+ coords = parse_location_tokens(match_coord, image_shape)
120
+ label = parse_label(match_coord)
121
+ annotation = self.create_bbox_annotation(coords, label, confidence)
122
+ annotations.append(annotation)
123
+
124
+ return annotations
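To make the detection prompt handling above concrete, a short sketch of what `initialize_objects_str` and `PaliGemmaDetectionKeys.detection_prompt` produce (the object list is an assumed example value):

```python
# Sketch: how PaliGemmaDetection builds its prompt from the objects_to_detect attribute.
objects_to_detect = ["cat", "dog"]             # assumed example; a plain string is used as-is
objects_str = "; ".join(objects_to_detect)     # initialize_objects_str for a list input
prompt = "detect {}".format(objects_str)       # PaliGemmaDetectionKeys.detection_prompt
print(prompt)                                  # -> detect cat; dog
# PaliGemma answers such prompts with location tokens like
# "<loc0100><loc0250><loc0750><loc0900> cat ; ...", which _create_annotation parses
# with the thirdparty helpers shown later in this diff.
```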
@@ -0,0 +1,260 @@
1
+ # -*- coding: utf-8 -*-
2
+ import numpy as np
3
+ import torch
4
+ from sinapsis_core.data_containers.annotations import BoundingBox, ImageAnnotations
5
+ from sinapsis_core.data_containers.data_packet import DataContainer, ImagePacket
6
+ from sinapsis_core.template_base import TemplateAttributeType
7
+ from sinapsis_data_visualization.helpers.detection_utils import bbox_xyxy_to_xywh
8
+ from sinapsis_huggingface_transformers.templates.pali_gemma.pali_gemma_base import (
9
+ PaliGemmaBase,
10
+ PaliGemmaBaseAttributes,
11
+ )
12
+ from transformers.generation.utils import GenerateOutput
13
+
14
+
15
+ class PaliGemmaInferenceAttributes(PaliGemmaBaseAttributes):
16
+ """Configuration attributes for PaliGemma inference.
17
+
18
+ Attributes:
19
+ prompt (str): Prompt to run the inference (default: "<image>caption en")
20
+
21
+ The <image> token is essential as it serves as a marker that tells the model where to look at the image
22
+ when processing the input. This token enables the model to understand the relationship between the visual
23
+ and textual components during processing.
24
+
25
+ Example prompts:
26
+ - "<image>caption en" -> Generates a basic caption in English
27
+ - "<image>What objects can you see in this image?" -> Lists objects in the image
28
+ """
29
+
30
+ prompt: str = "<image>caption en"
31
+
32
+
33
+ class PaliGemmaInference(PaliGemmaBase):
34
+ """Implementation of PaliGemma inference pipeline for image processing and caption generation.
35
+
36
+ This class handles the inference process for PaliGemma models, including image processing,
37
+ caption generation, and annotation creation. It supports both basic captioning and
38
+ detection/segmentation tasks.
39
+
40
+ Usage example:
41
+
42
+ agent:
43
+ name: my_test_agent
44
+ templates:
45
+ - template_name: InputTemplate
46
+ class_name: InputTemplate
47
+ attributes: {}
48
+ - template_name: PaliGemmaInference
49
+ class_name: PaliGemmaInference
50
+ template_input: InputTemplate
51
+ attributes:
52
+ model_path: '/path/to/paligemma/model'
53
+ processor_path: '/path/to/processor'
54
+ model_cache_dir: /path/to/cache/dir
55
+ device: 'cuda'
56
+ max_new_tokens: 200
57
+ torch_dtype: float16
58
+ prompt: <image> caption en
59
+
60
+ """
61
+
62
+ AttributesBaseModel = PaliGemmaInferenceAttributes
63
+ INPUT_IDS = "input_ids"
64
+
65
+ def __init__(self, attributes: TemplateAttributeType) -> None:
66
+ super().__init__(attributes)
67
+ self.prompt = self.attributes.prompt
68
+
69
+ def _prepare_inputs(self, image_content: np.ndarray) -> dict:
70
+ """Prepares the input for model inference by processing the image and text prompt.
71
+
72
+ Args:
73
+ image_content (np.ndarray): Raw image content to be processed as a numpy array
74
+
75
+ Returns:
76
+ dict: Processed inputs containing:
77
+ - input_ids (torch.Tensor): Token IDs for the text prompt and image tokens
78
+ - attention_mask (torch.Tensor): Binary mask indicating valid input positions (1s)
79
+ - pixel_values (torch.Tensor): Processed image tensor with normalized pixel values
80
+ in shape (batch_size, channels, height, width)
81
+
82
+ Note:
83
+ - The returned values are PyTorch tensors because the processor is called with return_tensors="pt"
84
+ """
85
+
86
+ return self.processor(
87
+ images=image_content,
88
+ text=self.prompt,
89
+ return_tensors="pt",
90
+ ).to(self.attributes.device)
91
+
92
+ def _generate_caption(self, inputs: dict) -> GenerateOutput:
93
+ """Generates caption using the model.
94
+
95
+ Args:
96
+ inputs (dict): Model inputs produced by the processor, including input IDs for the image and prompt
97
+
98
+ Returns:
99
+ GenerateOutput: A structured output containing:
100
+ - sequences: tensor with token IDs of the generated sequence
101
+ - scores: tuple of tensors with token prediction scores for each generation step
102
+ - logits: optional tensor with raw logits (None in this configuration)
103
+ - attentions: optional attention weights (None in this configuration)
104
+ - hidden_states: optional hidden states (None in this configuration)
105
+ - past_key_values: tuple of tensors containing past keys/values for attention mechanism
106
+
107
+ Configuration parameters:
108
+ - max_new_tokens: Maximum number of new tokens to generate
109
+ - return_dict_in_generate: Returns output as a structured dictionary
110
+ - output_scores: Includes prediction scores in the output
111
+ """
112
+ with torch.no_grad():
113
+ return self.model.generate(
114
+ **inputs,
115
+ max_new_tokens=self.attributes.max_new_tokens,
116
+ return_dict_in_generate=True,
117
+ output_scores=True,
118
+ )
119
+
120
+ @staticmethod
121
+ def _calculate_confidence_score(outputs: GenerateOutput) -> float:
122
+ """Calculates the confidence score from model generation outputs.
123
+
124
+ The confidence score is computed as the mean of the highest probability
125
+ for each generated token in the sequence.
126
+
127
+ Args:
128
+ outputs (GenerateOutput): Model generation output containing scores
129
+ for each generated token
130
+
131
+ Returns:
132
+ float: Average confidence score across all generated tokens
133
+ """
134
+ scores = torch.stack(outputs.scores)
135
+ probs = torch.softmax(scores, dim=-1)
136
+ token_confidences = torch.max(probs, dim=-1).values
137
+ return float(torch.mean(token_confidences).cpu())
138
+
139
+ def _decode_caption(self, outputs: GenerateOutput, input_len: int) -> str:
140
+ """Decodes the model output sequences into readable caption text.
141
+
142
+ Args:
143
+ outputs (GenerateOutput): Model generation output containing the
144
+ generated token sequences
145
+ input_len (int): Length of the input sequence to skip initial tokens
146
+
147
+ Returns:
148
+ str: Decoded caption text with special tokens removed
149
+ """
150
+ return self.processor.decode(outputs.sequences[0][input_len:], skip_special_tokens=True)
151
+
152
+ def _create_annotation(
153
+ self, caption: str, confidence: float, image_shape: tuple[int, ...]
154
+ ) -> list[ImageAnnotations]:
155
+ """Creates image annotations from the generated caption.
156
+
157
+ Args:
158
+ caption (str): Generated caption text
159
+ confidence (float): Confidence score for the prediction
160
+ image_shape (tuple[int, ...]): Shape of the input image
161
+
162
+ Returns:
163
+ list[ImageAnnotations]: List containing annotation with caption information
164
+ """
165
+
166
+ _, _ = self, image_shape
167
+ return [ImageAnnotations(text=caption, confidence_score=confidence)]
168
+
169
+ def _process_single_image(self, image_packet: ImagePacket) -> None:
170
+ """Processes a single image through the inference pipeline.
171
+
172
+ Args:
173
+ image_packet (ImagePacket): Container with image data and metadata
174
+
175
+ Returns:
176
+ None: Modifies the image_packet in place by adding annotations
177
+ """
178
+ inputs = self._prepare_inputs(image_packet.content)
179
+ outputs = self._generate_caption(inputs)
180
+ input_len = inputs[self.INPUT_IDS].shape[-1]
181
+ caption = self._decode_caption(outputs, input_len)
182
+ confidence = self._calculate_confidence_score(outputs)
183
+ annotations = self._create_annotation(caption, confidence, image_packet.content.shape)
184
+ image_packet.annotations.extend(annotations)
185
+
186
+ def _format_text_for_prompt(self, text: str) -> str:
187
+ """Formats the incoming text appropriately for the current task type.
188
+ Base implementation returns the text as-is, subclasses may override
189
+ to apply task-specific formatting.
190
+ Args:
191
+ text (str): Raw text content
192
+ Returns:
193
+ str: Formatted prompt text
194
+ """
195
+ _ = self
196
+ return text
197
+
198
+ def process_from_text_packet(self, container: DataContainer) -> None:
199
+ """
200
+ Extract prompts from the received list of text packets and use them to perform inference on each received image
201
+ packet.
202
+
203
+ Args:
204
+ container (DataContainer): Data-container with text and image packets to be processed.
205
+ """
206
+ for text_packet in container.texts:
207
+ self.prompt = self._format_text_for_prompt(text_packet.content)
208
+ if container.images:
209
+ for image_packet in container.images:
210
+ self._process_single_image(image_packet)
211
+
212
+ def process_from_prompt(self, container: DataContainer) -> None:
213
+ """
214
+ Perform inference on each received image packet using the prompt defined in the template attributes.
215
+
216
+ Args:
217
+ container (DataContainer): Data-container with image packets to be processed.
218
+ """
219
+ if container.images:
220
+ for image_packet in container.images:
221
+ self._process_single_image(image_packet)
222
+
223
+ def execute(self, container: DataContainer) -> DataContainer:
224
+ """Executes the inference pipeline on a batch of images.
225
+
226
+ If text packets are present, uses each text as input for prompt formatting.
227
+ If no text packets exist, uses the default prompt from attributes.
228
+
229
+ Args:
230
+ container (DataContainer): Container with text and image packets
231
+
232
+ Returns:
233
+ DataContainer: Processed container with added annotations
234
+ """
235
+ if container.texts:
236
+ self.process_from_text_packet(container)
237
+ else:
238
+ self.process_from_prompt(container)
239
+
240
+ return container
241
+
242
+ @staticmethod
243
+ def create_bbox_annotation(coords: tuple[float, ...], label: str, confidence: float) -> ImageAnnotations:
244
+ """Creates bounding box annotation from coordinates and metadata.
245
+
246
+ Args:
247
+ coords (tuple[float, ...]): Coordinates (x0, y0, x1, y1)
248
+ label (str): Label for the detected object
249
+ confidence (float): Confidence score for the detection
250
+
251
+ Returns:
252
+ ImageAnnotations: Annotation object with bounding box information
253
+ """
254
+ x0, y0, x1, y1 = coords
255
+ x, y, w, h = bbox_xyxy_to_xywh([x0, y0, x1, y1])
256
+ return ImageAnnotations(
257
+ label_str=label,
258
+ confidence_score=confidence,
259
+ bbox=BoundingBox(x=x, y=y, w=w, h=h),
260
+ )
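For reference, a self-contained sketch of the confidence computation performed by `_calculate_confidence_score` above, run on two made-up score tensors (toy values, not real model output):

```python
# Toy illustration: softmax the per-step logits, take the best token probability
# at each generation step, and average the result into a single confidence score.
import torch

scores = (
    torch.tensor([[2.0, 0.5, -1.0]]),  # assumed logits for generation step 1 (batch=1, vocab=3)
    torch.tensor([[0.1, 3.0, 0.2]]),   # assumed logits for generation step 2
)
stacked = torch.stack(scores)                         # (steps, batch, vocab)
probs = torch.softmax(stacked, dim=-1)                # per-step probability distributions
token_confidences = torch.max(probs, dim=-1).values   # highest probability at each step
confidence = float(torch.mean(token_confidences))     # average over steps, roughly 0.84 here
print(confidence)
```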
@@ -39,6 +39,7 @@ class SpeechToTextTransformers(TransformersBase):
39
39
  def __init__(self, attributes: TemplateAttributeType) -> None:
40
40
  super().__init__(attributes)
41
41
  self.task = "automatic-speech-recognition"
42
+ self.setup_pipeline()
42
43
 
43
44
  def transformation_method(self, container: DataContainer) -> DataContainer:
44
45
  """Speech recognition (speech-to-text) using a Transformers Pipeline.
@@ -38,6 +38,7 @@ class SummarizationTransformers(TransformersBase):
38
38
  def __init__(self, attributes: TemplateAttributeType) -> None:
39
39
  super().__init__(attributes)
40
40
  self.task = "summarization"
41
+ self.setup_pipeline()
41
42
 
42
43
  def transformation_method(self, container: DataContainer) -> DataContainer:
43
44
  """Summarize text using a Transformers Pipeline.
@@ -64,8 +64,9 @@ class TextToSpeechTransformers(TransformersBase):
64
64
 
65
65
  def __init__(self, attributes: TemplateAttributeType) -> None:
66
66
  super().__init__(attributes)
67
- self.sample_rate = self._get_sample_rate()
68
67
  self.task = "text-to-speech"
68
+ self.setup_pipeline()
69
+ self.sample_rate = self._get_sample_rate()
69
70
 
70
71
  def _get_sample_rate(self) -> int:
71
72
  """Retrieve the sample rate for the generated audio.
@@ -56,6 +56,7 @@ class TranslationTransformers(TransformersBase):
56
56
  def __init__(self, attributes: TemplateAttributeType) -> None:
57
57
  super().__init__(attributes)
58
58
  self.task = f"translation_{self.attributes.source_language}_to_{self.attributes.target_language}"
59
+ self.setup_pipeline()
59
60
 
60
61
  def transformation_method(self, container: DataContainer) -> DataContainer:
61
62
  """Translate text using a Transformers Pipeline.
@@ -0,0 +1,70 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ The constants and methods declared in this file are inspired by the following source:
4
+
5
+ https://github.com/google/generative-ai-docs/blob/main/site/en/gemma/docs/paligemma/inference-with-keras.ipynb
6
+
7
+ which is Licensed under the Apache License, Version 2.0.
8
+
9
+ """
10
+
11
+ import numpy as np
12
+ import regex as re
13
+
14
+ COORDS_PATTERN: str = r"<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>"
15
+ LABEL_PATTERN: str = r" (?P<label>.+?)( ;|$)"
16
+
17
+ DETECTION_PATTERN: str = COORDS_PATTERN + LABEL_PATTERN
18
+
19
+ LOCATION_KEYS: tuple[str, ...] = ("y0", "x0", "y1", "x1")
20
+ LOCATION_SCALE: float = 1024.0
21
+
22
+
23
+ def parse_location_tokens(match_coord: re.Match, image_shape: tuple[int, ...]) -> np.ndarray:
24
+ """Parses location tokens from model output into normalized coordinates.
25
+
26
+ Args:
27
+ match_coord (re.Match): Regex match containing the location tokens
28
+ image_shape (tuple[int, ...]): Shape of the input image
29
+
30
+ Returns:
31
+ np.ndarray: Pixel coordinates (x0, y0, x1, y1) scaled to the image size
32
+ """
33
+ match_dict = match_coord.groupdict()
34
+ x0 = float(match_dict[LOCATION_KEYS[1]]) / LOCATION_SCALE * image_shape[1]
35
+ y0 = float(match_dict[LOCATION_KEYS[0]]) / LOCATION_SCALE * image_shape[0]
36
+ x1 = float(match_dict[LOCATION_KEYS[3]]) / LOCATION_SCALE * image_shape[1]
37
+ y1 = float(match_dict[LOCATION_KEYS[2]]) / LOCATION_SCALE * image_shape[0]
38
+ return np.array([x0, y0, x1, y1])
39
+
40
+
41
+ def parse_label(match_coord: re.Match) -> str:
42
+ """
43
+ Retrieves detection label from a regex Match object.
44
+
45
+
46
+ Args:
47
+ match_coord (Match): The Match object containing the label information.
48
+
49
+ Returns:
50
+ str: The detection label.
51
+ """
52
+ label = match_coord.groupdict().get("label")
53
+ if label is None:
54
+ return ""
55
+ return label.strip()
56
+
57
+
58
+ def get_matches(caption: str) -> re.Scanner:
59
+ """
60
+ Creates an iterable containing all the detection matches found in the
61
+ produced model caption.
62
+
63
+ Args:
64
+ caption (str): The caption produced by the PaliGemma model.
65
+
66
+ Returns:
67
+ Scanner: An iterable object containing all the regex matches.
68
+ """
69
+
70
+ return re.finditer(DETECTION_PATTERN, caption)
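Taken together, a short sketch of how these helpers turn a PaliGemma detection caption into pixel-space boxes (the caption and image shape are assumed example values; the import path matches the one used in pali_gemma_detection.py):

```python
# Sketch: parse an example detection caption with the helpers defined above.
from sinapsis_huggingface_transformers.thirdparty.helpers import (
    get_matches,
    parse_label,
    parse_location_tokens,
)

caption = "<loc0100><loc0250><loc0750><loc0900> cat"  # loc tokens are y0, x0, y1, x1 in the 0-1023 range
image_shape = (480, 640, 3)                           # (height, width, channels)

for match in get_matches(caption):
    coords = parse_location_tokens(match, image_shape)  # x scaled by width/1024, y by height/1024
    label = parse_label(match)
    print(label, coords)  # cat [156.25  46.875  562.5  351.5625] as pixel (x0, y0, x1, y1)
```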