vision-agent 0.2.123__tar.gz → 0.2.125__tar.gz
- {vision_agent-0.2.123 → vision_agent-0.2.125}/PKG-INFO +1 -1
- {vision_agent-0.2.123 → vision_agent-0.2.125}/pyproject.toml +1 -1
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/lmm.py +26 -136
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/__init__.py +5 -3
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools.py +70 -5
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools_types.py +2 -2
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/image_utils.py +47 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/LICENSE +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/README.md +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tool_utils.py +2 -2
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/lmm/lmm.py

@@ -1,77 +1,36 @@
-import base64
-import io
 import json
 import logging
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any,
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast

 import anthropic
 import requests
 from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
 from openai import AzureOpenAI, OpenAI
-from PIL import Image

-
-from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
+from vision_agent.utils.image_utils import encode_media

 from .types import Message

 _LOGGER = logging.getLogger(__name__)


-def encode_image_bytes(image: bytes) -> str:
-    image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
-    buffer = io.BytesIO()
-    image.save(buffer, format="PNG")  # type: ignore
-    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
-    return encoded_image
-
-
-def encode_media(media: Union[str, Path]) -> str:
-    if type(media) is str and media.startswith(("http", "https")):
-        # for mp4 video url, we assume there is a same url but ends with png
-        # vision-agent-ui will upload this png when uploading the video
-        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
-            return media[:-4] + ".png"
-        return media
-    extension = "png"
-    extension = Path(media).suffix
-    if extension.lower() not in {
-        ".jpg",
-        ".jpeg",
-        ".png",
-        ".webp",
-        ".bmp",
-        ".mp4",
-        ".mov",
-    }:
-        raise ValueError(f"Unsupported image extension: {extension}")
-
-    image_bytes = b""
-    if extension.lower() in {".mp4", ".mov"}:
-        frames = T.extract_frames(media)
-        image = frames[len(frames) // 2]
-        buffer = io.BytesIO()
-        Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
-        image_bytes = buffer.getvalue()
-    else:
-        image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
-
-
 class LMM(ABC):
     @abstractmethod
     def generate(
-        self,
+        self,
+        prompt: str,
+        media: Optional[Sequence[Union[str, Path]]] = None,
+        **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass

     @abstractmethod
     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
@@ -79,7 +38,7 @@ class LMM(ABC):
     @abstractmethod
     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         pass
@@ -111,7 +70,7 @@ class OpenAILMM(LMM):

     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -120,13 +79,13 @@ class OpenAILMM(LMM):

     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.

         Parameters:
-            chat (
+            chat (Squence[Dict[str, str]]): A list of dictionaries containing the chat
                 messages. The messages can be in the format:
                 [{"role": "user", "content": "Hello!"}, ...]
                 or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ class OpenAILMM(LMM):
                                 "url": (
                                     encoded_media
                                     if encoded_media.startswith(("http", "https"))
+                                    or encoded_media.startswith("data:image/")
                                     else f"data:image/png;base64,{encoded_media}"
                                 ),
                                 "detail": "low",
@@ -174,7 +134,7 @@ class OpenAILMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ class OpenAILMM(LMM):
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url":
+                            "url": (
+                                encoded_media
+                                if encoded_media.startswith(("http", "https"))
+                                or encoded_media.startswith("data:image/")
+                                else f"data:image/png;base64,{encoded_media}"
+                            ),
                             "detail": "low",
                         },
                     },
@@ -214,81 +179,6 @@ class OpenAILMM(LMM):
         else:
             return cast(str, response.choices[0].message.content)

-    def generate_classifier(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.clip])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.clip(x, params["prompt"])
-
-    def generate_detector(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.owl_v2])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.owl_v2(params["prompt"], x)
-
-    def generate_segmentor(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.grounding_sam])
-        prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-
-        try:
-            params = json.loads(cast(str, response.choices[0].message.content))[
-                "Parameters"
-            ]
-        except json.JSONDecodeError:
-            _LOGGER.error(
-                f"Failed to decode response: {response.choices[0].message.content}"
-            )
-            raise ValueError("Failed to decode response")
-
-        return lambda x: T.grounding_sam(params["prompt"], x)
-
-    def generate_image_qa_tool(self, question: str) -> Callable:
-        return lambda x: T.git_vqa_v2(question, x)
-

 class AzureOpenAILMM(OpenAILMM):
     def __init__(
@@ -362,7 +252,7 @@ class OllamaLMM(LMM):

     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -371,13 +261,13 @@ class OllamaLMM(LMM):

     def chat(
         self,
-        chat:
+        chat: Sequence[Message],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.

         Parameters:
-            chat (
+            chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
                 messages. The messages can be in the format:
                 [{"role": "user", "content": "Hello!"}, ...]
                 or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ class OllamaLMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ class ClaudeSonnetLMM(LMM):

     def __call__(
         self,
-        input: Union[str,
+        input: Union[str, Sequence[Dict[str, Any]]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
@@ -502,7 +392,7 @@ class ClaudeSonnetLMM(LMM):

     def chat(
         self,
-        chat:
+        chat: Sequence[Dict[str, Any]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ class ClaudeSonnetLMM(LMM):
     def generate(
         self,
         prompt: str,
-        media: Optional[
+        media: Optional[Sequence[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         content: List[Union[TextBlockParam, ImageBlockParam]] = [
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/__init__.py

@@ -16,6 +16,8 @@ from .tools import (
     clip,
     closest_box_distance,
     closest_mask_distance,
+    countgd_counting,
+    countgd_example_based_counting,
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
@@ -30,6 +32,8 @@ from .tools import (
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
+    gpt4o_image_vqa,
+    gpt4o_video_vqa,
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
@@ -37,13 +41,11 @@ from .tools import (
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
-    countgd_counting,
-    countgd_example_based_counting,
     ocr,
     overlay_bounding_boxes,
+    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-    overlay_counting_results,
     owl_v2,
     save_image,
     save_json,
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools.py

@@ -13,26 +13,27 @@ import cv2
 import numpy as np
 import requests
 from moviepy.editor import ImageSequenceClip
-from PIL import Image, ImageDraw,
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore

 from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.lmm.lmm import OpenAILMM
 from vision_agent.tools.tool_utils import (
+    filter_bboxes_by_threshold,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
     send_inference_request,
     send_task_inference_request,
-    filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
     FineTuning,
     Florence2FtRequest,
     JobStatus,
-    PromptTask,
     ODResponseData,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -42,6 +43,7 @@ from vision_agent.utils.image_utils import (
     convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    encode_image_bytes,
     frames_to_bytes,
     get_image_size,
     normalize_bbox,
@@ -691,6 +693,69 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])


+def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    lmm = OpenAILMM()
+    buffer = io.BytesIO()
+    Image.fromarray(image).save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+    resp = lmm.generate(prompt, [image_b64])
+    return cast(str, resp)
+
+
+def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'gpt4o_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    lmm = OpenAILMM()
+
+    if len(frames) > 10:
+        step = len(frames) / 10
+        frames = [frames[int(i * step)] for i in range(10)]
+
+    frames_b64 = []
+    for frame in frames:
+        buffer = io.BytesIO()
+        Image.fromarray(frame).save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+        image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+        frames_b64.append(image_b64)
+
+    resp = lmm.generate(prompt, frames_b64)
+    return cast(str, resp)
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
@@ -1755,7 +1820,6 @@ def overlay_counting_results(

 FUNCTION_TOOLS = [
     owl_v2,
-    extract_frames,
     ocr,
     clip,
     vit_image_classification,
@@ -1776,6 +1840,7 @@ FUNCTION_TOOLS = [
 ]

 UTIL_TOOLS = [
+    extract_frames,
     save_json,
     load_image,
     save_image,
@@ -1791,7 +1856,7 @@ TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
-TOOLS_INFO = get_tools_info(
+TOOLS_INFO = get_tools_info(FUNCTION_TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
     [
         save_json,
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tools_types.py

@@ -1,8 +1,8 @@
 from enum import Enum
+from typing import List, Optional, Tuple, Union
 from uuid import UUID
-from typing import List, Tuple, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field,
+from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer


 class BboxInput(BaseModel):
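SerializationInfo and field_serializer belong to pydantic v2's custom-serializer API; this hunk only touches imports, so where they are applied is not visible here. A generic sketch of the pattern they enable, with a purely hypothetical model that is not from tools_types.py:

    from typing import List

    from pydantic import BaseModel, SerializationInfo, field_serializer

    class ExampleBoxes(BaseModel):  # hypothetical, for illustration only
        bboxes: List[List[float]]

        @field_serializer("bboxes")
        def serialize_bboxes(
            self, value: List[List[float]], info: SerializationInfo
        ) -> List[List[float]]:
            # round coordinates on dump; info exposes the serialization context
            return [[round(v, 2) for v in box] for box in value]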
|
@@ -13,6 +13,8 @@ from moviepy.editor import ImageSequenceClip
|
|
13
13
|
from PIL import Image, ImageDraw, ImageFont
|
14
14
|
from PIL.Image import Image as ImageType
|
15
15
|
|
16
|
+
from vision_agent.utils import extract_frames_from_video
|
17
|
+
|
16
18
|
COLORS = [
|
17
19
|
(158, 218, 229),
|
18
20
|
(219, 219, 141),
|
@@ -172,6 +174,51 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
|
|
172
174
|
)
|
173
175
|
|
174
176
|
|
177
|
+
def encode_image_bytes(image: bytes) -> str:
|
178
|
+
image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
|
179
|
+
buffer = io.BytesIO()
|
180
|
+
image.save(buffer, format="PNG") # type: ignore
|
181
|
+
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
182
|
+
return encoded_image
|
183
|
+
|
184
|
+
|
185
|
+
def encode_media(media: Union[str, Path]) -> str:
|
186
|
+
if isinstance(media, str) and media.startswith(("http", "https")):
|
187
|
+
# for mp4 video url, we assume there is a same url but ends with png
|
188
|
+
# vision-agent-ui will upload this png when uploading the video
|
189
|
+
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
|
190
|
+
return media[:-4] + ".png"
|
191
|
+
return media
|
192
|
+
|
193
|
+
# if media is already a base64 encoded image return
|
194
|
+
if isinstance(media, str) and media.startswith("data:image/"):
|
195
|
+
return media
|
196
|
+
|
197
|
+
extension = "png"
|
198
|
+
extension = Path(media).suffix
|
199
|
+
if extension.lower() not in {
|
200
|
+
".jpg",
|
201
|
+
".jpeg",
|
202
|
+
".png",
|
203
|
+
".webp",
|
204
|
+
".bmp",
|
205
|
+
".mp4",
|
206
|
+
".mov",
|
207
|
+
}:
|
208
|
+
raise ValueError(f"Unsupported image extension: {extension}")
|
209
|
+
|
210
|
+
image_bytes = b""
|
211
|
+
if extension.lower() in {".mp4", ".mov"}:
|
212
|
+
frames = extract_frames_from_video(str(media), fps=1)
|
213
|
+
image = frames[len(frames) // 2]
|
214
|
+
buffer = io.BytesIO()
|
215
|
+
Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
|
216
|
+
image_bytes = buffer.getvalue()
|
217
|
+
else:
|
218
|
+
image_bytes = open(media, "rb").read()
|
219
|
+
return encode_image_bytes(image_bytes)
|
220
|
+
|
221
|
+
|
175
222
|
def denormalize_bbox(
|
176
223
|
bbox: List[Union[int, float]], image_size: Tuple[int, ...]
|
177
224
|
) -> List[float]:
|
{vision_agent-0.2.123 → vision_agent-0.2.125}/vision_agent/tools/tool_utils.py

@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

 import pandas as pd
@@ -10,10 +10,10 @@ from requests import Session
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

+from vision_agent.tools.tools_types import BoundingBoxes
 from vision_agent.utils.exceptions import RemoteToolCallFailed
 from vision_agent.utils.execute import Error, MimeType
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.tools_types import BoundingBoxes

 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)