sinapsis-huggingface 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. sinapsis_huggingface-0.2.1.dist-info/METADATA +297 -0
  2. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/RECORD +26 -21
  3. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/WHEEL +1 -1
  4. sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/base_diffusers.py +2 -2
  5. sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/image_to_image_diffusers.py +1 -1
  6. sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/image_to_video_gen_xl_diffusers.py +1 -1
  7. sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/inpainting_diffusers.py +1 -1
  8. sinapsis_huggingface_embeddings/src/sinapsis_huggingface_embeddings/templates/hugging_face_embedding_extractor.py +2 -2
  9. sinapsis_huggingface_embeddings/src/sinapsis_huggingface_embeddings/templates/speaker_embedding_from_audio.py +2 -1
  10. sinapsis_huggingface_embeddings/src/sinapsis_huggingface_embeddings/templates/speaker_embedding_from_dataset.py +2 -4
  11. sinapsis_huggingface_grounding_dino/src/sinapsis_huggingface_grounding_dino/templates/grounding_dino.py +2 -1
  12. sinapsis_huggingface_grounding_dino/src/sinapsis_huggingface_grounding_dino/templates/grounding_dino_classification.py +0 -7
  13. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/__init__.py +3 -0
  14. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/base_transformers.py +13 -2
  15. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/image_to_text_transformers.py +2 -1
  16. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_base.py +97 -0
  17. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_detection.py +124 -0
  18. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_inference.py +260 -0
  19. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py +2 -1
  20. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py +2 -1
  21. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py +3 -2
  22. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py +2 -1
  23. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/__init__.py +0 -0
  24. sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/helpers.py +70 -0
  25. sinapsis_huggingface-0.1.0.dist-info/METADATA +0 -921
  26. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/licenses/LICENSE +0 -0
  27. {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/top_level.txt +0 -0
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_detection.py
@@ -0,0 +1,124 @@
+ # -*- coding: utf-8 -*-
+ from dataclasses import dataclass
+
+ from sinapsis_core.data_containers.annotations import ImageAnnotations
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
+ from sinapsis_huggingface_transformers.templates.pali_gemma.pali_gemma_inference import (
+     PaliGemmaInference,
+     PaliGemmaInferenceAttributes,
+ )
+ from sinapsis_huggingface_transformers.thirdparty.helpers import (
+     get_matches,
+     parse_label,
+     parse_location_tokens,
+ )
+
+
+ @dataclass(frozen=True)
+ class PaliGemmaDetectionKeys:
+     "Keys to use during detection"
+
+     detection_prompt: str = "detect {}"
+
+
+ class PaliGemmaDetectionAttributes(PaliGemmaInferenceAttributes):
+     """Configuration attributes for PaliGemma object detection tasks.
+
+     This class extends the base inference attributes to handle object detection specific configurations.
+
+     Attributes:
+         objects_to_detect (str | list[str]): Target objects to detect, can be a single string or list of strings
+     """
+
+     objects_to_detect: str | list[str]
+
+
+ class PaliGemmaDetection(PaliGemmaInference):
+     """Implementation of PaliGemma object detection pipeline.
+
+     The template inherits functionality from its base class, extending
+     the functionality to run inference on an image and to identify
+     the objects from the attributes.
+
+     Usage example:
+
+     agent:
+       name: my_test_agent
+       templates:
+       - template_name: InputTemplate
+         class_name: InputTemplate
+         attributes: {}
+       - template_name: PaliGemmaDetection
+         class_name: PaliGemmaDetection
+         template_input: InputTemplate
+         attributes:
+           model_path: '/path/to/paligemma/model'
+           processor_path: '/path/to/processor'
+           model_cache_dir: /path/to/cache/dir
+           device: 'cuda'
+           max_new_tokens: 200
+           torch_dtype: float16
+           prompt: <image> caption en
+           objects_to_detect: 'object to detect'
+
+     """
+
+     AttributesBaseModel = PaliGemmaDetectionAttributes
+     KEYS = PaliGemmaDetectionKeys
+
+     def __init__(self, attributes: TemplateAttributeType) -> None:
+         super().__init__(attributes)
+
+         objects_str = self.initialize_objects_str()
+         self.prompt = self.KEYS.detection_prompt.format(objects_str)
+
+     def initialize_objects_str(self) -> str:
+         """
+         Initialize the objects to detect string according to the specified format.
+
+         Returns:
+             str: String enlisting the objects to be defined in the detection prompt.
+         """
+
+         if isinstance(self.attributes.objects_to_detect, str):
+             return self.attributes.objects_to_detect
+         return "; ".join(self.attributes.objects_to_detect)
+
+     def _format_text_for_prompt(self, text: str) -> str:
+         """Formats input text as a detection prompt.
+
+         Args:
+             text (str): Raw text content (expected to be objects to detect)
+
+         Returns:
+             str: Formatted detection prompt
+         """
+         return self.KEYS.detection_prompt.format(text)
+
+     def _create_annotation(
+         self, caption: str, confidence: float, image_shape: tuple[int, ...]
+     ) -> list[ImageAnnotations]:
+         """Creates structured annotations from detection model outputs.
+
+         Processes the model's output caption to extract bounding box coordinates
+         and object labels for each detected instance.
+
+         Args:
+             caption (str): Raw detection output from the model
+             confidence (float): Confidence score for the predictions
+             image_shape (tuple[int, ...]): Dimensions of the input image (height, width)
+
+         Returns:
+             list[ImageAnnotations]: List of annotations containing bounding boxes and labels
+                 for each detected object
+         """
+         annotations = []
+         matches = get_matches(caption)
+
+         for match_coord in matches:
+             coords = parse_location_tokens(match_coord, image_shape)
+             label = parse_label(match_coord)
+             annotation = self.create_bbox_annotation(coords, label, confidence)
+             annotations.append(annotation)
+
+         return annotations
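As a quick aside on the template above, here is a standalone sketch of how PaliGemmaDetection assembles its prompt from objects_to_detect. The "detect {}" template and the "; " separator are taken from the code in this hunk; the object names are illustrative:

    objects_to_detect = ["cat", "dog", "person"]   # value supplied in the YAML attributes
    objects_str = "; ".join(objects_to_detect)     # "cat; dog; person"
    prompt = "detect {}".format(objects_str)       # "detect cat; dog; person"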
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_inference.py
@@ -0,0 +1,260 @@
+ # -*- coding: utf-8 -*-
+ import numpy as np
+ import torch
+ from sinapsis_core.data_containers.annotations import BoundingBox, ImageAnnotations
+ from sinapsis_core.data_containers.data_packet import DataContainer, ImagePacket
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
+ from sinapsis_data_visualization.helpers.detection_utils import bbox_xyxy_to_xywh
+ from sinapsis_huggingface_transformers.templates.pali_gemma.pali_gemma_base import (
+     PaliGemmaBase,
+     PaliGemmaBaseAttributes,
+ )
+ from transformers.generation.utils import GenerateOutput
+
+
+ class PaliGemmaInferenceAttributes(PaliGemmaBaseAttributes):
+     """Configuration attributes for PaliGemma inference.
+
+     Attributes:
+         prompt (str): Prompt to run the inference (default: "<image>caption en")
+
+     The <image> token is essential as it serves as a marker that tells the model where to look at the image
+     when processing the input. This token enables the model to understand the relationship between the visual
+     and textual components during processing.
+
+     Example prompts:
+         - "<image>caption en" -> Generates a basic caption in English
+         - "<image>What objects can you see in this image?" -> Lists objects in the image
+     """
+
+     prompt: str = "<image>caption en"
+
+
+ class PaliGemmaInference(PaliGemmaBase):
+     """Implementation of PaliGemma inference pipeline for image processing and caption generation.
+
+     This class handles the inference process for PaliGemma models, including image processing,
+     caption generation, and annotation creation. It supports both basic captioning and
+     detection/segmentation tasks.
+
+     Usage example:
+
+     agent:
+       name: my_test_agent
+       templates:
+       - template_name: InputTemplate
+         class_name: InputTemplate
+         attributes: {}
+       - template_name: PaliGemmaInference
+         class_name: PaliGemmaInference
+         template_input: InputTemplate
+         attributes:
+           model_path: '/path/to/paligemma/model'
+           processor_path: '/path/to/processor'
+           model_cache_dir: /path/to/cache/dir
+           device: 'cuda'
+           max_new_tokens: 200
+           torch_dtype: float16
+           prompt: <image> caption en
+
+     """
+
+     AttributesBaseModel = PaliGemmaInferenceAttributes
+     INPUT_IDS = "input_ids"
+
+     def __init__(self, attributes: TemplateAttributeType) -> None:
+         super().__init__(attributes)
+         self.prompt = self.attributes.prompt
+
+     def _prepare_inputs(self, image_content: np.ndarray) -> dict:
+         """Prepares the input for model inference by processing the image and text prompt.
+
+         Args:
+             image_content (np.ndarray): Raw image content to be processed as a numpy array
+
+         Returns:
+             dict: Processed inputs containing:
+                 - input_ids (torch.Tensor): Token IDs for the text prompt and image tokens
+                 - attention_mask (torch.Tensor): Binary mask indicating valid input positions (1s)
+                 - pixel_values (torch.Tensor): Processed image tensor with normalized pixel values
+                   in shape (batch_size, channels, height, width)
+
+         Note:
+             - The returned values are PyTorch tensors because the processor is called with return_tensors="pt"
+         """
+
+         return self.processor(
+             images=image_content,
+             text=self.prompt,
+             return_tensors="pt",
+         ).to(self.attributes.device)
+
+     def _generate_caption(self, inputs: dict) -> GenerateOutput:
+         """Generates caption using the model.
+
+         Args:
+             inputs (dict): Processed model inputs for the processor, including input IDs of the image and prompt
+
+         Returns:
+             GenerateOutput: A structured output containing:
+                 - sequences: tensor with token IDs of the generated sequence
+                 - scores: tuple of tensors with token prediction scores for each generation step
+                 - logits: optional tensor with raw logits (None in this configuration)
+                 - attentions: optional attention weights (None in this configuration)
+                 - hidden_states: optional hidden states (None in this configuration)
+                 - past_key_values: tuple of tensors containing past keys/values for attention mechanism
+
+         Configuration parameters:
+             - max_new_tokens: Maximum number of new tokens to generate
+             - return_dict_in_generate: Returns output as a structured dictionary
+             - output_scores: Includes prediction scores in the output
+         """
+         with torch.no_grad():
+             return self.model.generate(
+                 **inputs,
+                 max_new_tokens=self.attributes.max_new_tokens,
+                 return_dict_in_generate=True,
+                 output_scores=True,
+             )
+
+     @staticmethod
+     def _calculate_confidence_score(outputs: GenerateOutput) -> float:
+         """Calculates the confidence score from model generation outputs.
+
+         The confidence score is computed as the mean of the highest probability
+         for each generated token in the sequence.
+
+         Args:
+             outputs (GenerateOutput): Model generation output containing scores
+                 for each generated token
+
+         Returns:
+             float: Average confidence score across all generated tokens
+         """
+         scores = torch.stack(outputs.scores)
+         probs = torch.softmax(scores, dim=-1)
+         token_confidences = torch.max(probs, dim=-1).values
+         return float(torch.mean(token_confidences).cpu())
+
+     def _decode_caption(self, outputs: GenerateOutput, input_len: int) -> str:
+         """Decodes the model output sequences into readable caption text.
+
+         Args:
+             outputs (GenerateOutput): Model generation output containing the
+                 generated token sequences
+             input_len (int): Length of the input sequence to skip initial tokens
+
+         Returns:
+             str: Decoded caption text with special tokens removed
+         """
+         return self.processor.decode(outputs.sequences[0][input_len:], skip_special_tokens=True)
+
+     def _create_annotation(
+         self, caption: str, confidence: float, image_shape: tuple[int, ...]
+     ) -> list[ImageAnnotations]:
+         """Creates image annotations from the generated caption.
+
+         Args:
+             caption (str): Generated caption text
+             confidence (float): Confidence score for the prediction
+             image_shape (tuple[int, ...]): Shape of the input image
+
+         Returns:
+             list[ImageAnnotations]: List containing annotation with caption information
+         """
+
+         _, _ = self, image_shape
+         return [ImageAnnotations(text=caption, confidence_score=confidence)]
+
+     def _process_single_image(self, image_packet: ImagePacket) -> None:
+         """Processes a single image through the inference pipeline.
+
+         Args:
+             image_packet (ImagePacket): Container with image data and metadata
+
+         Returns:
+             None: Modifies the image_packet in place by adding annotations
+         """
+         inputs = self._prepare_inputs(image_packet.content)
+         outputs = self._generate_caption(inputs)
+         input_len = inputs[self.INPUT_IDS].shape[-1]
+         caption = self._decode_caption(outputs, input_len)
+         confidence = self._calculate_confidence_score(outputs)
+         annotations = self._create_annotation(caption, confidence, image_packet.content.shape)
+         image_packet.annotations.extend(annotations)
+
+     def _format_text_for_prompt(self, text: str) -> str:
+         """Formats the incoming text appropriately for the current task type.
+         Base implementation returns the text as-is, subclasses may override
+         to apply task-specific formatting.
+         Args:
+             text (str): Raw text content
+         Returns:
+             str: Formatted prompt text
+         """
+         _ = self
+         return text
+
+     def process_from_text_packet(self, container: DataContainer) -> None:
+         """
+         Extract prompts from the received list of text packets and use them to perform inference on each received image
+         packet.
+
+         Args:
+             container (DataContainer): Data-container with text and image packets to be processed.
+         """
+         for text_packet in container.texts:
+             self.prompt = self._format_text_for_prompt(text_packet.content)
+             if container.images:
+                 for image_packet in container.images:
+                     self._process_single_image(image_packet)
+
+     def process_from_prompt(self, container: DataContainer) -> None:
+         """
+         Perform inference on each received image packet using the prompt defined in template attributes.
+
+         Args:
+             container (DataContainer): Data-container with image packets to be processed.
+         """
+         if container.images:
+             for image_packet in container.images:
+                 self._process_single_image(image_packet)
+
+     def execute(self, container: DataContainer) -> DataContainer:
+         """Executes the inference pipeline on a batch of images.
+
+         If text packets are present, uses each text as input for prompt formatting.
+         If no text packets exist, uses the default prompt from attributes.
+
+         Args:
+             container (DataContainer): Container with text and image packets
+
+         Returns:
+             DataContainer: Processed container with added annotations
+         """
+         if container.texts:
+             self.process_from_text_packet(container)
+         else:
+             self.process_from_prompt(container)
+
+         return container
+
+     @staticmethod
+     def create_bbox_annotation(coords: tuple[float, ...], label: str, confidence: float) -> ImageAnnotations:
+         """Creates bounding box annotation from coordinates and metadata.
+
+         Args:
+             coords (tuple[float, ...]): Coordinates (x0, y0, x1, y1)
+             label (str): Label for the detected object
+             confidence (float): Confidence score for the detection
+
+         Returns:
+             ImageAnnotations: Annotation object with bounding box information
+         """
+         x0, y0, x1, y1 = coords
+         x, y, w, h = bbox_xyxy_to_xywh([x0, y0, x1, y1])
+         return ImageAnnotations(
+             label_str=label,
+             confidence_score=confidence,
+             bbox=BoundingBox(x=x, y=y, w=w, h=h),
+         )
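The confidence logic in _calculate_confidence_score can be checked in isolation. Below is a minimal sketch with dummy logits; the vocabulary size and tensors are made up, only the stack/softmax/max/mean chain mirrors the method above:

    import torch

    scores = (torch.randn(1, 32_000), torch.randn(1, 32_000))  # dummy per-token logits, as in outputs.scores
    probs = torch.softmax(torch.stack(scores), dim=-1)         # (num_generated_tokens, batch, vocab) probabilities
    token_confidences = torch.max(probs, dim=-1).values        # highest probability for each generated token
    confidence = float(torch.mean(token_confidences).cpu())    # average over the generated sequence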
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py
@@ -2,7 +2,7 @@
 
  import numpy as np
  from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
- from sinapsis_core.template_base.template import TemplateAttributeType
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
 
  from sinapsis_huggingface_transformers.templates.base_transformers import TransformersBase
 
@@ -39,6 +39,7 @@ class SpeechToTextTransformers(TransformersBase):
      def __init__(self, attributes: TemplateAttributeType) -> None:
          super().__init__(attributes)
          self.task = "automatic-speech-recognition"
+         self.setup_pipeline()
 
      def transformation_method(self, container: DataContainer) -> DataContainer:
          """Speech recognition (speech-to-text) using a Transformers Pipeline.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py
@@ -1,7 +1,7 @@
  # -*- coding: utf-8 -*-
 
  from sinapsis_core.data_containers.data_packet import DataContainer
- from sinapsis_core.template_base import TemplateAttributeType
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
 
  from sinapsis_huggingface_transformers.templates.base_transformers import TransformersBase
 
@@ -38,6 +38,7 @@ class SummarizationTransformers(TransformersBase):
      def __init__(self, attributes: TemplateAttributeType) -> None:
          super().__init__(attributes)
          self.task = "summarization"
+         self.setup_pipeline()
 
      def transformation_method(self, container: DataContainer) -> DataContainer:
          """Summarize text using a Transformers Pipeline.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py
@@ -3,7 +3,7 @@
  import numpy as np
  import torch
  from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, TextPacket
- from sinapsis_core.template_base import TemplateAttributeType
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
 
  from sinapsis_huggingface_transformers.helpers import sentences_to_n_words, split_text_into_sentences
  from sinapsis_huggingface_transformers.templates.base_transformers import (
@@ -64,8 +64,9 @@ class TextToSpeechTransformers(TransformersBase):
 
      def __init__(self, attributes: TemplateAttributeType) -> None:
          super().__init__(attributes)
-         self.sample_rate = self._get_sample_rate()
          self.task = "text-to-speech"
+         self.setup_pipeline()
+         self.sample_rate = self._get_sample_rate()
 
      def _get_sample_rate(self) -> int:
          """Retrieve the sample rate for the generated audio.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py
@@ -2,7 +2,7 @@
 
 
  from sinapsis_core.data_containers.data_packet import DataContainer
- from sinapsis_core.template_base import TemplateAttributeType
+ from sinapsis_core.template_base.base_models import TemplateAttributeType
 
  from sinapsis_huggingface_transformers.templates.base_transformers import (
      TransformersBase,
@@ -56,6 +56,7 @@ class TranslationTransformers(TransformersBase):
      def __init__(self, attributes: TemplateAttributeType) -> None:
          super().__init__(attributes)
          self.task = f"translation_{self.attributes.source_language}_to_{self.attributes.target_language}"
+         self.setup_pipeline()
 
      def transformation_method(self, container: DataContainer) -> DataContainer:
          """Translate text using a Transformers Pipeline.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/helpers.py
@@ -0,0 +1,70 @@
+ # -*- coding: utf-8 -*-
+ """
+ The constants and methods declared in this file are inspired by the following source:
+
+ https://github.com/google/generative-ai-docs/blob/main/site/en/gemma/docs/paligemma/inference-with-keras.ipynb
+
+ which is Licensed under the Apache License, Version 2.0.
+
+ """
+
+ import numpy as np
+ import regex as re
+
+ COORDS_PATTERN: str = r"<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>"
+ LABEL_PATTERN: str = r" (?P<label>.+?)( ;|$)"
+
+ DETECTION_PATTERN: str = COORDS_PATTERN + LABEL_PATTERN
+
+ LOCATION_KEYS: tuple[str, ...] = ("y0", "x0", "y1", "x1")
+ LOCATION_SCALE: float = 1024.0
+
+
+ def parse_location_tokens(match_coord: re.Match, image_shape: tuple[int, ...]) -> np.ndarray:
+     """Parses location tokens from model output into pixel coordinates.
+
+     Args:
+         match_coord (re.Match): Regex match containing the location tokens
+         image_shape (tuple[int, ...]): Shape of the input image
+
+     Returns:
+         np.ndarray: Coordinates (x0, y0, x1, y1) scaled to the image size
+     """
+     match_dict = match_coord.groupdict()
+     x0 = float(match_dict[LOCATION_KEYS[1]]) / LOCATION_SCALE * image_shape[1]
+     y0 = float(match_dict[LOCATION_KEYS[0]]) / LOCATION_SCALE * image_shape[0]
+     x1 = float(match_dict[LOCATION_KEYS[3]]) / LOCATION_SCALE * image_shape[1]
+     y1 = float(match_dict[LOCATION_KEYS[2]]) / LOCATION_SCALE * image_shape[0]
+     return np.array([x0, y0, x1, y1])
+
+
+ def parse_label(match_coord: re.Match) -> str:
+     """
+     Retrieves detection label from a regex Match object.
+
+
+     Args:
+         match_coord (Match): The Match object containing the label information.
+
+     Returns:
+         str: The detection label.
+     """
+     label = match_coord.groupdict().get("label")
+     if label is None:
+         return ""
+     return label.strip()
+
+
+ def get_matches(caption: str) -> re.Scanner:
+     """
+     Creates an iterable containing all the detection matches found in the
+     produced model caption.
+
+     Args:
+         caption (str): The caption produced by the paligemma model.
+
+     Returns:
+         Scanner: An iterable object containing all the regex matches.
+     """
+
+     return re.finditer(DETECTION_PATTERN, caption)
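To make the location-token arithmetic concrete, here is a standalone worked example using the constants above; the caption string and image size are made up for illustration:

    import regex as re

    COORDS_PATTERN = r"<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>"
    LABEL_PATTERN = r" (?P<label>.+?)( ;|$)"
    caption = "<loc0256><loc0128><loc0768><loc0896> cat"
    match = next(re.finditer(COORDS_PATTERN + LABEL_PATTERN, caption))

    height, width = 480, 640                  # image_shape[:2]
    x0 = float(match["x0"]) / 1024.0 * width  # 128/1024 * 640 = 80.0
    y0 = float(match["y0"]) / 1024.0 * height # 256/1024 * 480 = 120.0
    x1 = float(match["x1"]) / 1024.0 * width  # 896/1024 * 640 = 560.0
    y1 = float(match["y1"]) / 1024.0 * height # 768/1024 * 480 = 360.0
    print(match["label"], (x0, y0, x1, y1))   # cat (80.0, 120.0, 560.0, 360.0)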