sinapsis-huggingface 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sinapsis_huggingface-0.2.1.dist-info/METADATA +297 -0
- {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/RECORD +26 -21
- {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/WHEEL +1 -1
- sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/base_diffusers.py +2 -2
- sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/image_to_image_diffusers.py +1 -1
- sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/image_to_video_gen_xl_diffusers.py +1 -1
- sinapsis_huggingface_diffusers/src/sinapsis_huggingface_diffusers/templates/inpainting_diffusers.py +1 -1
- sinapsis_huggingface_embeddings/src/sinapsis_huggingface_embeddings/templates/hugging_face_embedding_extractor.py +2 -2
- sinapsis_huggingface_embeddings/src/sinapsis_huggingface_embeddings/templates/speaker_embedding_from_audio.py +2 -1
- sinapsis_huggingface_embeddings/src/sinapsis_huggingface_embeddings/templates/speaker_embedding_from_dataset.py +2 -4
- sinapsis_huggingface_grounding_dino/src/sinapsis_huggingface_grounding_dino/templates/grounding_dino.py +2 -1
- sinapsis_huggingface_grounding_dino/src/sinapsis_huggingface_grounding_dino/templates/grounding_dino_classification.py +0 -7
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/__init__.py +3 -0
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/base_transformers.py +13 -2
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/image_to_text_transformers.py +2 -1
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_base.py +97 -0
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_detection.py +124 -0
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_inference.py +260 -0
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py +2 -1
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py +2 -1
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py +3 -2
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py +2 -1
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/__init__.py +0 -0
- sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/helpers.py +70 -0
- sinapsis_huggingface-0.1.0.dist-info/METADATA +0 -921
- {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sinapsis_huggingface-0.1.0.dist-info → sinapsis_huggingface-0.2.1.dist-info}/top_level.txt +0 -0
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_detection.py
ADDED
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+from dataclasses import dataclass
+
+from sinapsis_core.data_containers.annotations import ImageAnnotations
+from sinapsis_core.template_base.base_models import TemplateAttributeType
+from sinapsis_huggingface_transformers.templates.pali_gemma.pali_gemma_inference import (
+    PaliGemmaInference,
+    PaliGemmaInferenceAttributes,
+)
+from sinapsis_huggingface_transformers.thirdparty.helpers import (
+    get_matches,
+    parse_label,
+    parse_location_tokens,
+)
+
+
+@dataclass(frozen=True)
+class PaliGemmaDetectionKeys:
+    "Keys to use during detection"
+
+    detection_prompt: str = "detect {}"
+
+
+class PaliGemmaDetectionAttributes(PaliGemmaInferenceAttributes):
+    """Configuration attributes for PaliGemma object detection tasks.
+
+    This class extends the base inference attributes to handle object detection specific configurations.
+
+    Attributes:
+        objects_to_detect (str | list[str]): Target objects to detect, can be a single string or list of strings
+    """
+
+    objects_to_detect: str | list[str]
+
+
+class PaliGemmaDetection(PaliGemmaInference):
+    """Implementation of PaliGemma object detection pipeline.
+
+    The template inherits functionality from its base class, extending
+    the functionality to run inference on an image and to identify
+    the objects from the attributes.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+    templates:
+    - template_name: InputTemplate
+      class_name: InputTemplate
+      attributes: {}
+    - template_name: PaliGemmaDetection
+      class_name: PaliGemmaDetection
+      template_input: InputTemplate
+      attributes:
+        model_path: '/path/to/paligemma/model'
+        processor_path: '`/path/to/processor'
+        model_cache_dir: /path/to/cache/dir
+        device: 'cuda'
+        max_new_tokens: 200
+        torch_dtype: float16
+        prompt: <image> caption en
+        objects_to_detect: 'object to detect'
+
+    """
+
+    AttributesBaseModel = PaliGemmaDetectionAttributes
+    KEYS = PaliGemmaDetectionKeys
+
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        super().__init__(attributes)
+
+        objects_str = self.initialize_objects_str()
+        self.prompt = self.KEYS.detection_prompt.format(objects_str)
+
+    def initialize_objects_str(self) -> str:
+        """
+        Initialize the objects to detect string according to the specified format.
+
+        Returns:
+            str: String enlisting the objects to be defined in the detection prompt.
+        """
+
+        if isinstance(self.attributes.objects_to_detect, str):
+            return self.attributes.objects_to_detect
+        return "; ".join(self.attributes.objects_to_detect)
+
+    def _format_text_for_prompt(self, text: str) -> str:
+        """Formats input text as a detection prompt.
+
+        Args:
+            text (str): Raw text content (expected to be objects to detect)
+
+        Returns:
+            str: Formatted detection prompt
+        """
+        return self.KEYS.detection_prompt.format(text)
+
+    def _create_annotation(
+        self, caption: str, confidence: float, image_shape: tuple[int, ...]
+    ) -> list[ImageAnnotations]:
+        """Creates structured annotations from detection model outputs.
+
+        Processes the model's output caption to extract bounding box coordinates
+        and object labels for each detected instance.
+
+        Args:
+            caption (str): Raw detection output from the model
+            confidence (float): Confidence score for the predictions
+            image_shape (tuple[int, ...]): Dimensions of the input image (height, width)
+
+        Returns:
+            list[ImageAnnotations]: List of annotations containing bounding boxes and labels
+                for each detected object
+        """
+        annotations = []
+        matches = get_matches(caption)
+
+        for match_coord in matches:
+            coords = parse_location_tokens(match_coord, image_shape)
+            label = parse_label(match_coord)
+            annotation = self.create_bbox_annotation(coords, label, confidence)
+            annotations.append(annotation)
+
+        return annotations
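For context, the prompt that PaliGemmaDetection builds in __init__ is easy to reproduce in isolation. The snippet below is a standalone sketch with made-up attribute values, not code shipped in the package:

# Standalone sketch of the prompt construction done in PaliGemmaDetection.__init__;
# the attribute values here are hypothetical.
objects_to_detect = ["cat", "dog"]  # PaliGemmaDetectionAttributes accepts str | list[str]

if isinstance(objects_to_detect, str):
    objects_str = objects_to_detect
else:
    objects_str = "; ".join(objects_to_detect)  # same joining as initialize_objects_str

prompt = "detect {}".format(objects_str)  # PaliGemmaDetectionKeys.detection_prompt
print(prompt)  # detect cat; dog

The same "detect {}" template is applied by _format_text_for_prompt to prompts that arrive through text packets.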
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/pali_gemma/pali_gemma_inference.py
ADDED
@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import torch
+from sinapsis_core.data_containers.annotations import BoundingBox, ImageAnnotations
+from sinapsis_core.data_containers.data_packet import DataContainer, ImagePacket
+from sinapsis_core.template_base.base_models import TemplateAttributeType
+from sinapsis_data_visualization.helpers.detection_utils import bbox_xyxy_to_xywh
+from sinapsis_huggingface_transformers.templates.pali_gemma.pali_gemma_base import (
+    PaliGemmaBase,
+    PaliGemmaBaseAttributes,
+)
+from transformers.generation.utils import GenerateOutput
+
+
+class PaliGemmaInferenceAttributes(PaliGemmaBaseAttributes):
+    """Configuration attributes for PaliGemma inference.
+
+    Attributes:
+        prompt (str): Prompt to run the inference (default: "<image>caption en")
+
+    The <image> token is essential as it serves as a marker that tells the model where to look at the image
+    when processing the input. This token enables the model to understand the relationship between the visual
+    and textual components during processing.
+
+    Example prompts:
+    - "<image>caption en" -> Generates a basic caption in English
+    - "<image>What objects can you see in this image?" -> Lists objects in the image
+    """
+
+    prompt: str = "<image>caption en"
+
+
+class PaliGemmaInference(PaliGemmaBase):
+    """Implementation of PaliGemma inference pipeline for image processing and caption generation.
+
+    This class handles the inference process for PaliGemma models, including image processing,
+    caption generation, and annotation creation. It supports both basic captioning and
+    detection/segmentation tasks.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+    templates:
+    - template_name: InputTemplate
+      class_name: InputTemplate
+      attributes: {}
+    - template_name: PaliGemmaInference
+      class_name: PaliGemmaInference
+      template_input: InputTemplate
+      attributes:
+        model_path: '/path/to/paligemma/model'
+        processor_path: '`/path/to/processor'
+        model_cache_dir: /path/to/cache/dir
+        device: 'cuda'
+        max_new_tokens: 200
+        torch_dtype: float16
+        prompt: <image> caption en
+
+    """
+
+    AttributesBaseModel = PaliGemmaInferenceAttributes
+    INPUT_IDS = "input_ids"
+
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        super().__init__(attributes)
+        self.prompt = self.attributes.prompt
+
+    def _prepare_inputs(self, image_content: np.ndarray) -> dict:
+        """Prepares the input for model inference by processing the image and text prompt.
+
+        Args:
+            image_content (np.ndarray): Raw image content to be processed as a numpy array
+
+        Returns:
+            dict: Processed inputs containing:
+                - input_ids (torch.Tensor): Token IDs for the text prompt and image tokens
+                - attention_mask (torch.Tensor): Binary mask indicating valid input positions (1s)
+                - pixel_values (torch.Tensor): Processed image tensor with normalized pixel values
+                    in shape (batch_size, channels, height, width)
+
+        Note:
+            - The format of the returns it's because uses PyTorch tensors (return_tensors="pt")
+        """
+
+        return self.processor(
+            images=image_content,
+            text=self.prompt,
+            return_tensors="pt",
+        ).to(self.attributes.device)
+
+    def _generate_caption(self, inputs: dict) -> torch.Tensor:
+        """Generates caption using the model.
+
+        Args:
+            inputs (dict): Processed model inputs for the processor, including input IDs of the image and prompt
+
+        Returns:
+            GeneratedCaptionOutput: A structured output containing:
+                - sequences: tensor with token IDs of the generated sequence
+                - scores: tuple of tensors with token prediction scores for each generation step
+                - logits: optional tensor with raw logits (None in this configuration)
+                - attentions: optional attention weights (None in this configuration)
+                - hidden_states: optional hidden states (None in this configuration)
+                - past_key_values: tuple of tensors containing past keys/values for attention mechanism
+
+        Configuration parameters:
+            - max_new_tokens: Maximum number of new tokens to generate
+            - return_dict_in_generate: Returns output as a structured dictionary
+            - output_scores: Includes prediction scores in the output
+        """
+        with torch.no_grad():
+            return self.model.generate(
+                **inputs,
+                max_new_tokens=self.attributes.max_new_tokens,
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+
+    @staticmethod
+    def _calculate_confidence_score(outputs: GenerateOutput) -> float:
+        """Calculates the confidence score from model generation outputs.
+
+        The confidence score is computed as the mean of the highest probability
+        for each generated token in the sequence.
+
+        Args:
+            outputs (GenerateOutput): Model generation output containing scores
+                for each generated token
+
+        Returns:
+            float: Average confidence score across all generated tokens
+        """
+        scores = torch.stack(outputs.scores)
+        probs = torch.softmax(scores, dim=-1)
+        token_confidences = torch.max(probs, dim=-1).values
+        return float(torch.mean(token_confidences).cpu())
+
+    def _decode_caption(self, outputs: GenerateOutput, input_len: int) -> str:
+        """Decodes the model output sequences into readable caption text.
+
+        Args:
+            outputs (GenerateOutput): Model generation output containing the
+                generated token sequences
+            input_len (int): Length of the input sequence to skip initial tokens
+
+        Returns:
+            str: Decoded caption text with special tokens removed
+        """
+        return self.processor.decode(outputs.sequences[0][input_len:], skip_special_tokens=True)
+
+    def _create_annotation(
+        self, caption: str, confidence: float, image_shape: tuple[int, ...]
+    ) -> list[ImageAnnotations]:
+        """Creates image annotations from the generated caption.
+
+        Args:
+            caption (str): Generated caption text
+            confidence (float): Confidence score for the prediction
+            image_shape (tuple[int, ...]): Shape of the input image
+
+        Returns:
+            list[ImageAnnotations]: List containing annotation with caption information
+        """
+
+        _, _ = self, image_shape
+        return [ImageAnnotations(text=caption, confidence_score=confidence)]
+
+    def _process_single_image(self, image_packet: ImagePacket) -> None:
+        """Processes a single image through the inference pipeline.
+
+        Args:
+            image_packet (ImagePacket): Container with image data and metadata
+
+        Returns:
+            None: Modifies the image_packet in place by adding annotations
+        """
+        inputs = self._prepare_inputs(image_packet.content)
+        outputs = self._generate_caption(inputs)
+        input_len = inputs[self.INPUT_IDS].shape[-1]
+        caption = self._decode_caption(outputs, input_len)
+        confidence = self._calculate_confidence_score(outputs)
+        annotations = self._create_annotation(caption, confidence, image_packet.content.shape)
+        image_packet.annotations.extend(annotations)
+
+    def _format_text_for_prompt(self, text: str) -> str:
+        """Formats the incoming text appropriately for the current task type.
+        Base implementation returns the text as-is, subclasses may override
+        to apply task-specific formatting.
+        Args:
+            text (str): Raw text content
+        Returns:
+            str: Formatted prompt text
+        """
+        _ = self
+        return text
+
+    def process_from_text_packet(self, container: DataContainer) -> None:
+        """
+        Extract prompts from the received list of text packets and use them to perform inference in each received image
+        packet.
+
+        Args:
+            container (DataContainer): Data-container with text and image packets to be processed.
+        """
+        for text_packet in container.texts:
+            self.prompt = self._format_text_for_prompt(text_packet.content)
+            if container.images:
+                for image_packet in container.images:
+                    self._process_single_image(image_packet)
+
+    def process_from_prompt(self, container: DataContainer) -> None:
+        """
+        Perform inference in each received image packet using the prompt defined in template attributes.
+
+        Args:
+            container (DataContainer): Data-container with image packets to be processed.
+        """
+        if container.images:
+            for image_packet in container.images:
+                self._process_single_image(image_packet)
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """Executes the inference pipeline on a batch of images.
+
+        If text packets are present, uses each text as input for prompt formatting.
+        If no text packets exist, uses the default prompt from attributes.
+
+        Args:
+            container (DataContainer): Container with text and image packets
+
+        Returns:
+            DataContainer: Processed container with added annotations
+        """
+        if container.texts:
+            self.process_from_text_packet(container)
+        else:
+            self.process_from_prompt(container)
+
+        return container
+
+    @staticmethod
+    def create_bbox_annotation(coords: tuple[float, ...], label: str, confidence: float) -> ImageAnnotations:
+        """Creates bounding box annotation from coordinates and metadata.
+
+        Args:
+            coords (tuple[float, ...]): Coordinates (x0, y0, x1, y1)
+            label (str): Label for the detected object
+            confidence (float): Confidence score for the detection
+
+        Returns:
+            ImageAnnotations: Annotation object with bounding box information
+        """
+        x0, y0, x1, y1 = coords
+        x, y, w, h = bbox_xyxy_to_xywh([x0, y0, x1, y1])
+        return ImageAnnotations(
+            label_str=label,
+            confidence_score=confidence,
+            bbox=BoundingBox(x=x, y=y, w=w, h=h),
+        )
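The per-token confidence used by _calculate_confidence_score can be reproduced outside the template. The following is a self-contained sketch in which random tensors stand in for GenerateOutput.scores and the sizes are illustrative only:

import torch

# Random stand-ins for GenerateOutput.scores: one (batch, vocab) tensor per generated token.
num_steps, batch_size, vocab_size = 5, 1, 32000
scores = tuple(torch.randn(batch_size, vocab_size) for _ in range(num_steps))

stacked = torch.stack(scores)                         # (num_steps, batch_size, vocab_size)
probs = torch.softmax(stacked, dim=-1)                # probability distribution per generation step
token_confidences = torch.max(probs, dim=-1).values   # highest probability per generated token
confidence = float(torch.mean(token_confidences).cpu())
print(confidence)

With real model outputs, the same computation averages the winning token probabilities over the generated caption.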
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/speech_to_text_transformers.py
@@ -2,7 +2,7 @@

 import numpy as np
 from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket
-from sinapsis_core.template_base.
+from sinapsis_core.template_base.base_models import TemplateAttributeType

 from sinapsis_huggingface_transformers.templates.base_transformers import TransformersBase

@@ -39,6 +39,7 @@ class SpeechToTextTransformers(TransformersBase):
     def __init__(self, attributes: TemplateAttributeType) -> None:
         super().__init__(attributes)
         self.task = "automatic-speech-recognition"
+        self.setup_pipeline()

     def transformation_method(self, container: DataContainer) -> DataContainer:
         """Speech recognition (speech-to-text) using a Transformers Pipeline.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/summarization_transformers.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 from sinapsis_core.data_containers.data_packet import DataContainer
-from sinapsis_core.template_base import TemplateAttributeType
+from sinapsis_core.template_base.base_models import TemplateAttributeType

 from sinapsis_huggingface_transformers.templates.base_transformers import TransformersBase

@@ -38,6 +38,7 @@ class SummarizationTransformers(TransformersBase):
     def __init__(self, attributes: TemplateAttributeType) -> None:
         super().__init__(attributes)
         self.task = "summarization"
+        self.setup_pipeline()

     def transformation_method(self, container: DataContainer) -> DataContainer:
         """Summarize text using a Transformers Pipeline.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/text_to_speech_transformers.py
@@ -3,7 +3,7 @@
 import numpy as np
 import torch
 from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer, TextPacket
-from sinapsis_core.template_base import TemplateAttributeType
+from sinapsis_core.template_base.base_models import TemplateAttributeType

 from sinapsis_huggingface_transformers.helpers import sentences_to_n_words, split_text_into_sentences
 from sinapsis_huggingface_transformers.templates.base_transformers import (
@@ -64,8 +64,9 @@ class TextToSpeechTransformers(TransformersBase):

     def __init__(self, attributes: TemplateAttributeType) -> None:
         super().__init__(attributes)
-        self.sample_rate = self._get_sample_rate()
         self.task = "text-to-speech"
+        self.setup_pipeline()
+        self.sample_rate = self._get_sample_rate()

     def _get_sample_rate(self) -> int:
         """Retrieve the sample rate for the generated audio.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/templates/translation_transformers.py
@@ -2,7 +2,7 @@


 from sinapsis_core.data_containers.data_packet import DataContainer
-from sinapsis_core.template_base import TemplateAttributeType
+from sinapsis_core.template_base.base_models import TemplateAttributeType

 from sinapsis_huggingface_transformers.templates.base_transformers import (
     TransformersBase,
@@ -56,6 +56,7 @@ class TranslationTransformers(TransformersBase):
     def __init__(self, attributes: TemplateAttributeType) -> None:
         super().__init__(attributes)
         self.task = f"translation_{self.attributes.source_language}_to_{self.attributes.target_language}"
+        self.setup_pipeline()

     def transformation_method(self, container: DataContainer) -> DataContainer:
         """Translate text using a Transformers Pipeline.
sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/__init__.py
ADDED
File without changes

sinapsis_huggingface_transformers/src/sinapsis_huggingface_transformers/thirdparty/helpers.py
ADDED
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+The constants and methods declared in this file are inspired in the following source:
+
+https://github.com/google/generative-ai-docs/blob/main/site/en/gemma/docs/paligemma/inference-with-keras.ipynb
+
+which is Licensed under the Apache License, Version 2.0.
+
+"""
+
+import numpy as np
+import regex as re
+
+COORDS_PATTERN: str = r"<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>"
+LABEL_PATTERN: str = r" (?P<label>.+?)( ;|$)"
+
+DETECTION_PATTERN: str = COORDS_PATTERN + LABEL_PATTERN
+
+LOCATION_KEYS: tuple[str, ...] = ("y0", "x0", "y1", "x1")
+LOCATION_SCALE: float = 1024.0
+
+
+def parse_location_tokens(match_coord: re.Match, image_shape: tuple[int, ...]) -> np.ndarray:
+    """Parses location tokens from model output into normalized coordinates.
+
+    Args:
+        match_coord (dict): Dictionary containing matched location tokens
+        image_shape (tuple[int, ...]): Shape of the input image
+
+    Returns:
+        np.ndarray: Normalized coordinates (x0, y0, x1, y1)
+    """
+    match_dict = match_coord.groupdict()
+    x0 = float(match_dict[LOCATION_KEYS[1]]) / LOCATION_SCALE * image_shape[1]
+    y0 = float(match_dict[LOCATION_KEYS[0]]) / LOCATION_SCALE * image_shape[0]
+    x1 = float(match_dict[LOCATION_KEYS[3]]) / LOCATION_SCALE * image_shape[1]
+    y1 = float(match_dict[LOCATION_KEYS[2]]) / LOCATION_SCALE * image_shape[0]
+    return np.array([x0, y0, x1, y1])
+
+
+def parse_label(match_coord: re.Match) -> str:
+    """
+    Retrieves detection label from a regex Match object.
+
+
+    Args:
+        match_coord (Match): The Match object containing the label information.
+
+    Returns:
+        str: The detection label.
+    """
+    label = match_coord.groupdict().get("label")
+    if label is None:
+        return ""
+    return label.strip()
+
+
+def get_matches(caption: str) -> re.Scanner:
+    """
+    Creates an iterable containing all the detection matches found in the
+    produced model caption.
+
+    Args:
+        caption (str): The caption produced by the paligemma model.
+
+    Returns:
+        Scanner: An iterable object containing all the regex matches.
+    """
+
+    return re.finditer(DETECTION_PATTERN, caption)
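To see these helpers end to end, the sketch below parses a made-up PaliGemma detection caption with the same pattern and scale. It is not part of the package: it uses the standard-library re module in place of the regex package, and the caption string and image shape are hypothetical:

import re

import numpy as np

DETECTION_PATTERN = (
    r"<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>"
    r" (?P<label>.+?)( ;|$)"
)
LOCATION_SCALE = 1024.0

# Hypothetical model output and image shape (height, width, channels).
caption = "<loc0250><loc0100><loc0750><loc0900> cat ; <loc0100><loc0200><loc0500><loc0600> dog"
image_shape = (480, 640, 3)

for match in re.finditer(DETECTION_PATTERN, caption):
    coords = match.groupdict()
    # <loc####> tokens index a 1024-bin grid in (y0, x0, y1, x1) order; rescale them to pixels.
    x0 = float(coords["x0"]) / LOCATION_SCALE * image_shape[1]
    y0 = float(coords["y0"]) / LOCATION_SCALE * image_shape[0]
    x1 = float(coords["x1"]) / LOCATION_SCALE * image_shape[1]
    y1 = float(coords["y1"]) / LOCATION_SCALE * image_shape[0]
    print(match.group("label").strip(), np.array([x0, y0, x1, y1]))

Each printed pair corresponds to the (label, xyxy box) that PaliGemmaDetection.create_bbox_annotation turns into an ImageAnnotations bounding box.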