vision-agent 0.2.123__py3-none-any.whl → 0.2.124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/lmm/lmm.py +26 -136
- vision_agent/tools/__init__.py +5 -3
- vision_agent/tools/tool_utils.py +2 -2
- vision_agent/tools/tools.py +68 -3
- vision_agent/tools/tools_types.py +2 -2
- vision_agent/utils/image_utils.py +47 -0
- {vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/METADATA +1 -1
- {vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/RECORD +10 -10
- {vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/WHEEL +0 -0
vision_agent/lmm/lmm.py
CHANGED
@@ -1,77 +1,36 @@
-import base64
-import io
 import json
 import logging
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any,
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
 
 import anthropic
 import requests
 from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
 from openai import AzureOpenAI, OpenAI
-from PIL import Image
 
-
-from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
+from vision_agent.utils.image_utils import encode_media
 
 from .types import Message
 
 _LOGGER = logging.getLogger(__name__)
 
 
-def encode_image_bytes(image: bytes) -> str:
-    image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
-    buffer = io.BytesIO()
-    image.save(buffer, format="PNG")  # type: ignore
-    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
-    return encoded_image
-
-
-def encode_media(media: Union[str, Path]) -> str:
-    if type(media) is str and media.startswith(("http", "https")):
-        # for mp4 video url, we assume there is a same url but ends with png
-        # vision-agent-ui will upload this png when uploading the video
-        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
-            return media[:-4] + ".png"
-        return media
-    extension = "png"
-    extension = Path(media).suffix
-    if extension.lower() not in {
-        ".jpg",
-        ".jpeg",
-        ".png",
-        ".webp",
-        ".bmp",
-        ".mp4",
-        ".mov",
-    }:
-        raise ValueError(f"Unsupported image extension: {extension}")
-
-    image_bytes = b""
-    if extension.lower() in {".mp4", ".mov"}:
-        frames = T.extract_frames(media)
-        image = frames[len(frames) // 2]
-        buffer = io.BytesIO()
-        Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
-        image_bytes = buffer.getvalue()
-    else:
-        image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
-
-
 class LMM(ABC):
     @abstractmethod
     def generate(
-        self,
+        self,
+        prompt: str,
+        media: Optional[Sequence[Union[str, Path]]] = None,
+        **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
 
     @abstractmethod
     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
@@ -79,7 +38,7 @@ class LMM(ABC):
     @abstractmethod
     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
@@ -111,7 +70,7 @@ class OpenAILMM(LMM):
 
     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -120,13 +79,13 @@ class OpenAILMM(LMM):
 
     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.
 
         Parameters:
-            chat (
+            chat (Squence[Dict[str, str]]): A list of dictionaries containing the chat
                 messages. The messages can be in the format:
                 [{"role": "user", "content": "Hello!"}, ...]
                 or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ class OpenAILMM(LMM):
                                 "url": (
                                     encoded_media
                                     if encoded_media.startswith(("http", "https"))
+                                    or encoded_media.startswith("data:image/")
                                     else f"data:image/png;base64,{encoded_media}"
                                 ),
                                 "detail": "low",
@@ -174,7 +134,7 @@ class OpenAILMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ class OpenAILMM(LMM):
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url":
+                        "url": (
+                            encoded_media
+                            if encoded_media.startswith(("http", "https"))
+                            or encoded_media.startswith("data:image/")
+                            else f"data:image/png;base64,{encoded_media}"
+                        ),
                         "detail": "low",
                     },
                 },
@@ -214,81 +179,6 @@ class OpenAILMM(LMM):
         else:
             return cast(str, response.choices[0].message.content)
 
-    def generate_classifier(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.clip])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.clip(x, params["prompt"])
-
-    def generate_detector(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.owl_v2])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.owl_v2(params["prompt"], x)
-
-    def generate_segmentor(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.grounding_sam])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.grounding_sam(params["prompt"], x)
-
-    def generate_image_qa_tool(self, question: str) -> Callable:
-        return lambda x: T.git_vqa_v2(question, x)
-
 
 class AzureOpenAILMM(OpenAILMM):
     def __init__(
@@ -362,7 +252,7 @@ class OllamaLMM(LMM):
 
     def __call__(
        self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -371,13 +261,13 @@ class OllamaLMM(LMM):
 
     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.
 
         Parameters:
-            chat (
+            chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
                 messages. The messages can be in the format:
                 [{"role": "user", "content": "Hello!"}, ...]
                 or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ class OllamaLMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ class ClaudeSonnetLMM(LMM):
 
     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Dict[str, Any]]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -502,7 +392,7 @@ class ClaudeSonnetLMM(LMM):
 
     def chat(
         self,
-        chat:
+        chat: Sequence[Dict[str, Any]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ class ClaudeSonnetLMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         content: List[Union[TextBlockParam, ImageBlockParam]] = [
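The net effect of these lmm.py hunks: the encoding helpers move out to vision_agent.utils.image_utils, the derived tool generators (generate_classifier and friends) are deleted, and every generate/chat/__call__ signature is pinned to Sequence types, with http(s) and data:image/ URLs now passed through to the API untouched. A minimal sketch of the updated surface; the OPENAI_API_KEY requirement and cat.png are placeholders, and the media-in-message dict shape is assumed from the chat docstring, which is cut off in this diff:

# Sketch only: OPENAI_API_KEY and cat.png are assumed placeholders.
from vision_agent.lmm.lmm import OpenAILMM

lmm = OpenAILMM()

# generate() now takes media: Optional[Sequence[Union[str, Path]]];
# http(s) URLs and "data:image/..." strings go through unchanged, local
# paths are base64-encoded by the relocated encode_media helper.
print(lmm.generate("What is in this image?", media=["cat.png"]))

# chat()/__call__() take a Sequence[Message]; media rides inside each
# message dict (format assumed, since the docstring is truncated here).
print(lmm([{"role": "user", "content": "Describe it.", "media": ["cat.png"]}]))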
vision_agent/tools/__init__.py
CHANGED
@@ -16,6 +16,8 @@ from .tools import (
     clip,
     closest_box_distance,
     closest_mask_distance,
+    countgd_counting,
+    countgd_example_based_counting,
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
@@ -30,6 +32,8 @@ from .tools import (
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
+    gpt4o_image_vqa,
+    gpt4o_video_vqa,
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
@@ -37,13 +41,11 @@ from .tools import (
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
-    countgd_counting,
-    countgd_example_based_counting,
     ocr,
     overlay_bounding_boxes,
+    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-    overlay_counting_results,
     owl_v2,
     save_image,
     save_json,
vision_agent/tools/tool_utils.py
CHANGED
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd
@@ -10,10 +10,10 @@ from requests import Session
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+from vision_agent.tools.tools_types import BoundingBoxes
 from vision_agent.utils.exceptions import RemoteToolCallFailed
 from vision_agent.utils.execute import Error, MimeType
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.tools_types import BoundingBoxes
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
vision_agent/tools/tools.py
CHANGED
@@ -13,26 +13,27 @@ import cv2
 import numpy as np
 import requests
 from moviepy.editor import ImageSequenceClip
-from PIL import Image, ImageDraw,
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.lmm.lmm import OpenAILMM
 from vision_agent.tools.tool_utils import (
+    filter_bboxes_by_threshold,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
     send_inference_request,
     send_task_inference_request,
-    filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
     FineTuning,
     Florence2FtRequest,
     JobStatus,
-    PromptTask,
     ODResponseData,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -42,6 +43,7 @@ from vision_agent.utils.image_utils import (
     convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    encode_image_bytes,
     frames_to_bytes,
     get_image_size,
     normalize_bbox,
@@ -691,6 +693,69 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
+def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    lmm = OpenAILMM()
+    buffer = io.BytesIO()
+    Image.fromarray(image).save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+    resp = lmm.generate(prompt, [image_b64])
+    return cast(str, resp)
+
+
+def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'gpt4o_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    lmm = OpenAILMM()
+
+    if len(frames) > 10:
+        step = len(frames) / 10
+        frames = [frames[int(i * step)] for i in range(10)]
+
+    frames_b64 = []
+    for frame in frames:
+        buffer = io.BytesIO()
+        Image.fromarray(frame).save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+        image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+        frames_b64.append(image_b64)
+
+    resp = lmm.generate(prompt, frames_b64)
+    return cast(str, resp)
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
vision_agent/tools/tools_types.py
CHANGED
@@ -1,8 +1,8 @@
 from enum import Enum
+from typing import List, Optional, Tuple, Union
 from uuid import UUID
-from typing import List, Tuple, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, Field,
+from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 
 
 class BboxInput(BaseModel):
vision_agent/utils/image_utils.py
CHANGED
@@ -13,6 +13,8 @@ from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as ImageType
 
+from vision_agent.utils import extract_frames_from_video
+
 COLORS = [
     (158, 218, 229),
     (219, 219, 141),
@@ -172,6 +174,51 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     )
 
 
+def encode_image_bytes(image: bytes) -> str:
+    image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")  # type: ignore
+    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    return encoded_image
+
+
+def encode_media(media: Union[str, Path]) -> str:
+    if isinstance(media, str) and media.startswith(("http", "https")):
+        # for mp4 video url, we assume there is a same url but ends with png
+        # vision-agent-ui will upload this png when uploading the video
+        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
+            return media[:-4] + ".png"
+        return media
+
+    # if media is already a base64 encoded image return
+    if isinstance(media, str) and media.startswith("data:image/"):
+        return media
+
+    extension = "png"
+    extension = Path(media).suffix
+    if extension.lower() not in {
+        ".jpg",
+        ".jpeg",
+        ".png",
+        ".webp",
+        ".bmp",
+        ".mp4",
+        ".mov",
+    }:
+        raise ValueError(f"Unsupported image extension: {extension}")
+
+    image_bytes = b""
+    if extension.lower() in {".mp4", ".mov"}:
+        frames = extract_frames_from_video(str(media), fps=1)
+        image = frames[len(frames) // 2]
+        buffer = io.BytesIO()
+        Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+    else:
+        image_bytes = open(media, "rb").read()
+    return encode_image_bytes(image_bytes)
+
+
 def denormalize_bbox(
     bbox: List[Union[int, float]], image_size: Tuple[int, ...]
 ) -> List[float]:
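The relocated encode_media now has three distinct paths worth noting; a small behavior sketch, where the URLs and file names are illustrative placeholders:

from vision_agent.utils.image_utils import encode_media

# 1. Already-encoded data URLs short-circuit (the newly added early return).
print(encode_media("data:image/png;base64,iVBORw0KGgo="))  # returned as-is

# 2. http(s) URLs pass through, except vision-agent-dev.s3 video URLs,
#    which are rewritten to the sibling .png the UI uploads.
print(encode_media("https://example.com/photo.jpg"))

# 3. Local files are read and base64-encoded via encode_image_bytes; for
#    .mp4/.mov the middle frame (sampled at fps=1) is encoded instead.
print(encode_media("photo.png")[:24])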
{vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/RECORD
CHANGED
@@ -12,22 +12,22 @@ vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=
+vision_agent/lmm/lmm.py,sha256=092oefI65_QSRvQm2znXkjTdzlZTh-Ni_38610kfbJg,16836
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=T8Hi5aHf4J2QJDoPRvu5fxbiqMpAY-1Gi2EFIhJbf3A,2331
 vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_types.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=62NVlojPMf9MuJ-3yJEcrB3mzmOxN2HrNQzzjVa-FZg,7527
+vision_agent/tools/tools.py,sha256=xT-lDC3NCdltK0_CDTOOiU8B2YhlIdzFhuSbvRVFBI8,65545
+vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
 vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
 vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
-vision_agent/utils/image_utils.py,sha256=
+vision_agent/utils/image_utils.py,sha256=lhdvRWMbQmMMLTmJGI1dFjzNeQSLfPYJEsAkq5Ydj3Y,11476
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.124.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.124.dist-info/METADATA,sha256=mDvhJytcxFZW_B18Vkn4egk4HJ8UHYl6YQhEJHQAbPk,12255
+vision_agent-0.2.124.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.124.dist-info/RECORD,,
{vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/LICENSE
File without changes
{vision_agent-0.2.123.dist-info → vision_agent-0.2.124.dist-info}/WHEEL
File without changes