vision-agent 0.2.123__tar.gz → 0.2.124__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. {vision_agent-0.2.123 → vision_agent-0.2.124}/PKG-INFO +1 -1
  2. {vision_agent-0.2.123 → vision_agent-0.2.124}/pyproject.toml +1 -1
  3. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/lmm/lmm.py +26 -136
  4. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/tools/__init__.py +5 -3
  5. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/tools/tools.py +68 -3
  6. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/tools/tools_types.py +2 -2
  7. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/image_utils.py +47 -0
  8. {vision_agent-0.2.123 → vision_agent-0.2.124}/LICENSE +0 -0
  9. {vision_agent-0.2.123 → vision_agent-0.2.124}/README.md +0 -0
  10. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/__init__.py +0 -0
  12. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/agent.py +0 -0
  13. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/agent_utils.py +0 -0
  14. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/vision_agent.py +0 -0
  15. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/vision_agent_coder.py +0 -0
  16. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  17. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/agent/vision_agent_prompts.py +0 -0
  18. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/clients/__init__.py +0 -0
  19. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/clients/http.py +0 -0
  20. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/clients/landing_public_api.py +0 -0
  21. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/fonts/__init__.py +0 -0
  22. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  23. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/lmm/__init__.py +0 -0
  24. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/lmm/types.py +0 -0
  25. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/tools/meta_tools.py +0 -0
  26. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/tools/tool_utils.py +2 -2
  28. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/__init__.py +0 -0
  29. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/exceptions.py +0 -0
  30. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/execute.py +0 -0
  31. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.123 → vision_agent-0.2.124}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.123
3
+ Version: 0.2.124
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.123"
7
+ version = "0.2.124"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -1,77 +1,36 @@
1
- import base64
2
- import io
3
1
  import json
4
2
  import logging
5
3
  import os
6
4
  from abc import ABC, abstractmethod
7
5
  from pathlib import Path
8
- from typing import Any, Callable, Dict, Iterator, List, Optional, Union, cast
6
+ from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
9
7
 
10
8
  import anthropic
11
9
  import requests
12
10
  from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
13
11
  from openai import AzureOpenAI, OpenAI
14
- from PIL import Image
15
12
 
16
- import vision_agent.tools as T
17
- from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
13
+ from vision_agent.utils.image_utils import encode_media
18
14
 
19
15
  from .types import Message
20
16
 
21
17
  _LOGGER = logging.getLogger(__name__)
22
18
 
23
19
 
24
- def encode_image_bytes(image: bytes) -> str:
25
- image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
26
- buffer = io.BytesIO()
27
- image.save(buffer, format="PNG") # type: ignore
28
- encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
29
- return encoded_image
30
-
31
-
32
- def encode_media(media: Union[str, Path]) -> str:
33
- if type(media) is str and media.startswith(("http", "https")):
34
- # for mp4 video url, we assume there is a same url but ends with png
35
- # vision-agent-ui will upload this png when uploading the video
36
- if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
37
- return media[:-4] + ".png"
38
- return media
39
- extension = "png"
40
- extension = Path(media).suffix
41
- if extension.lower() not in {
42
- ".jpg",
43
- ".jpeg",
44
- ".png",
45
- ".webp",
46
- ".bmp",
47
- ".mp4",
48
- ".mov",
49
- }:
50
- raise ValueError(f"Unsupported image extension: {extension}")
51
-
52
- image_bytes = b""
53
- if extension.lower() in {".mp4", ".mov"}:
54
- frames = T.extract_frames(media)
55
- image = frames[len(frames) // 2]
56
- buffer = io.BytesIO()
57
- Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
58
- image_bytes = buffer.getvalue()
59
- else:
60
- image_bytes = open(media, "rb").read()
61
- return encode_image_bytes(image_bytes)
62
-
63
-
64
20
  class LMM(ABC):
65
21
  @abstractmethod
66
22
  def generate(
67
- self, prompt: str, media: Optional[List[Union[str, Path]]] = None, **kwargs: Any
23
+ self,
24
+ prompt: str,
25
+ media: Optional[Sequence[Union[str, Path]]] = None,
26
+ **kwargs: Any,
68
27
  ) -> Union[str, Iterator[Optional[str]]]:
69
28
  pass
70
29
 
71
30
  @abstractmethod
72
31
  def chat(
73
32
  self,
74
- chat: List[Message],
33
+ chat: Sequence[Message],
75
34
  **kwargs: Any,
76
35
  ) -> Union[str, Iterator[Optional[str]]]:
77
36
  pass
@@ -79,7 +38,7 @@ class LMM(ABC):
79
38
  @abstractmethod
80
39
  def __call__(
81
40
  self,
82
- input: Union[str, List[Message]],
41
+ input: Union[str, Sequence[Message]],
83
42
  **kwargs: Any,
84
43
  ) -> Union[str, Iterator[Optional[str]]]:
85
44
  pass
@@ -111,7 +70,7 @@ class OpenAILMM(LMM):
111
70
 
112
71
  def __call__(
113
72
  self,
114
- input: Union[str, List[Message]],
73
+ input: Union[str, Sequence[Message]],
115
74
  **kwargs: Any,
116
75
  ) -> Union[str, Iterator[Optional[str]]]:
117
76
  if isinstance(input, str):
@@ -120,13 +79,13 @@ class OpenAILMM(LMM):
120
79
 
121
80
  def chat(
122
81
  self,
123
- chat: List[Message],
82
+ chat: Sequence[Message],
124
83
  **kwargs: Any,
125
84
  ) -> Union[str, Iterator[Optional[str]]]:
126
85
  """Chat with the LMM model.
127
86
 
128
87
  Parameters:
129
- chat (List[Dict[str, str]]): A list of dictionaries containing the chat
88
+ chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
130
89
  messages. The messages can be in the format:
131
90
  [{"role": "user", "content": "Hello!"}, ...]
132
91
  or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ class OpenAILMM(LMM):
147
106
  "url": (
148
107
  encoded_media
149
108
  if encoded_media.startswith(("http", "https"))
109
+ or encoded_media.startswith("data:image/")
150
110
  else f"data:image/png;base64,{encoded_media}"
151
111
  ),
152
112
  "detail": "low",
@@ -174,7 +134,7 @@ class OpenAILMM(LMM):
174
134
  def generate(
175
135
  self,
176
136
  prompt: str,
177
- media: Optional[List[Union[str, Path]]] = None,
137
+ media: Optional[Sequence[Union[str, Path]]] = None,
178
138
  **kwargs: Any,
179
139
  ) -> Union[str, Iterator[Optional[str]]]:
180
140
  message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ class OpenAILMM(LMM):
192
152
  {
193
153
  "type": "image_url",
194
154
  "image_url": {
195
- "url": f"data:image/png;base64,{encoded_media}",
155
+ "url": (
156
+ encoded_media
157
+ if encoded_media.startswith(("http", "https"))
158
+ or encoded_media.startswith("data:image/")
159
+ else f"data:image/png;base64,{encoded_media}"
160
+ ),
196
161
  "detail": "low",
197
162
  },
198
163
  },
@@ -214,81 +179,6 @@ class OpenAILMM(LMM):
214
179
  else:
215
180
  return cast(str, response.choices[0].message.content)
216
181
 
217
- def generate_classifier(self, question: str) -> Callable:
218
- api_doc = T.get_tool_documentation([T.clip])
219
- prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
220
- response = self.client.chat.completions.create(
221
- model=self.model_name,
222
- messages=[
223
- {"role": "system", "content": SYSTEM_PROMPT},
224
- {"role": "user", "content": prompt},
225
- ],
226
- response_format={"type": "json_object"},
227
- )
228
-
229
- try:
230
- params = json.loads(cast(str, response.choices[0].message.content))[
231
- "Parameters"
232
- ]
233
- except json.JSONDecodeError:
234
- _LOGGER.error(
235
- f"Failed to decode response: {response.choices[0].message.content}"
236
- )
237
- raise ValueError("Failed to decode response")
238
-
239
- return lambda x: T.clip(x, params["prompt"])
240
-
241
- def generate_detector(self, question: str) -> Callable:
242
- api_doc = T.get_tool_documentation([T.owl_v2])
243
- prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
244
- response = self.client.chat.completions.create(
245
- model=self.model_name,
246
- messages=[
247
- {"role": "system", "content": SYSTEM_PROMPT},
248
- {"role": "user", "content": prompt},
249
- ],
250
- response_format={"type": "json_object"},
251
- )
252
-
253
- try:
254
- params = json.loads(cast(str, response.choices[0].message.content))[
255
- "Parameters"
256
- ]
257
- except json.JSONDecodeError:
258
- _LOGGER.error(
259
- f"Failed to decode response: {response.choices[0].message.content}"
260
- )
261
- raise ValueError("Failed to decode response")
262
-
263
- return lambda x: T.owl_v2(params["prompt"], x)
264
-
265
- def generate_segmentor(self, question: str) -> Callable:
266
- api_doc = T.get_tool_documentation([T.grounding_sam])
267
- prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
268
- response = self.client.chat.completions.create(
269
- model=self.model_name,
270
- messages=[
271
- {"role": "system", "content": SYSTEM_PROMPT},
272
- {"role": "user", "content": prompt},
273
- ],
274
- response_format={"type": "json_object"},
275
- )
276
-
277
- try:
278
- params = json.loads(cast(str, response.choices[0].message.content))[
279
- "Parameters"
280
- ]
281
- except json.JSONDecodeError:
282
- _LOGGER.error(
283
- f"Failed to decode response: {response.choices[0].message.content}"
284
- )
285
- raise ValueError("Failed to decode response")
286
-
287
- return lambda x: T.grounding_sam(params["prompt"], x)
288
-
289
- def generate_image_qa_tool(self, question: str) -> Callable:
290
- return lambda x: T.git_vqa_v2(question, x)
291
-
292
182
 
293
183
  class AzureOpenAILMM(OpenAILMM):
294
184
  def __init__(
@@ -362,7 +252,7 @@ class OllamaLMM(LMM):
362
252
 
363
253
  def __call__(
364
254
  self,
365
- input: Union[str, List[Message]],
255
+ input: Union[str, Sequence[Message]],
366
256
  **kwargs: Any,
367
257
  ) -> Union[str, Iterator[Optional[str]]]:
368
258
  if isinstance(input, str):
@@ -371,13 +261,13 @@ class OllamaLMM(LMM):
371
261
 
372
262
  def chat(
373
263
  self,
374
- chat: List[Message],
264
+ chat: Sequence[Message],
375
265
  **kwargs: Any,
376
266
  ) -> Union[str, Iterator[Optional[str]]]:
377
267
  """Chat with the LMM model.
378
268
 
379
269
  Parameters:
380
- chat (List[Dict[str, str]]): A list of dictionaries containing the chat
270
+ chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
381
271
  messages. The messages can be in the format:
382
272
  [{"role": "user", "content": "Hello!"}, ...]
383
273
  or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ class OllamaLMM(LMM):
429
319
  def generate(
430
320
  self,
431
321
  prompt: str,
432
- media: Optional[List[Union[str, Path]]] = None,
322
+ media: Optional[Sequence[Union[str, Path]]] = None,
433
323
  **kwargs: Any,
434
324
  ) -> Union[str, Iterator[Optional[str]]]:
435
325
  url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ class ClaudeSonnetLMM(LMM):
493
383
 
494
384
  def __call__(
495
385
  self,
496
- input: Union[str, List[Dict[str, Any]]],
386
+ input: Union[str, Sequence[Dict[str, Any]]],
497
387
  **kwargs: Any,
498
388
  ) -> Union[str, Iterator[Optional[str]]]:
499
389
  if isinstance(input, str):
@@ -502,7 +392,7 @@ class ClaudeSonnetLMM(LMM):
502
392
 
503
393
  def chat(
504
394
  self,
505
- chat: List[Dict[str, Any]],
395
+ chat: Sequence[Dict[str, Any]],
506
396
  **kwargs: Any,
507
397
  ) -> Union[str, Iterator[Optional[str]]]:
508
398
  messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ class ClaudeSonnetLMM(LMM):
551
441
  def generate(
552
442
  self,
553
443
  prompt: str,
554
- media: Optional[List[Union[str, Path]]] = None,
444
+ media: Optional[Sequence[Union[str, Path]]] = None,
555
445
  **kwargs: Any,
556
446
  ) -> Union[str, Iterator[Optional[str]]]:
557
447
  content: List[Union[TextBlockParam, ImageBlockParam]] = [
@@ -16,6 +16,8 @@ from .tools import (
16
16
  clip,
17
17
  closest_box_distance,
18
18
  closest_mask_distance,
19
+ countgd_counting,
20
+ countgd_example_based_counting,
19
21
  depth_anything_v2,
20
22
  detr_segmentation,
21
23
  dpt_hybrid_midas,
@@ -30,6 +32,8 @@ from .tools import (
30
32
  generate_soft_edge_image,
31
33
  get_tool_documentation,
32
34
  git_vqa_v2,
35
+ gpt4o_image_vqa,
36
+ gpt4o_video_vqa,
33
37
  grounding_dino,
34
38
  grounding_sam,
35
39
  ixc25_image_vqa,
@@ -37,13 +41,11 @@ from .tools import (
37
41
  load_image,
38
42
  loca_visual_prompt_counting,
39
43
  loca_zero_shot_counting,
40
- countgd_counting,
41
- countgd_example_based_counting,
42
44
  ocr,
43
45
  overlay_bounding_boxes,
46
+ overlay_counting_results,
44
47
  overlay_heat_map,
45
48
  overlay_segmentation_masks,
46
- overlay_counting_results,
47
49
  owl_v2,
48
50
  save_image,
49
51
  save_json,
@@ -13,26 +13,27 @@ import cv2
13
13
  import numpy as np
14
14
  import requests
15
15
  from moviepy.editor import ImageSequenceClip
16
- from PIL import Image, ImageDraw, ImageFont, ImageEnhance
16
+ from PIL import Image, ImageDraw, ImageEnhance, ImageFont
17
17
  from pillow_heif import register_heif_opener # type: ignore
18
18
  from pytube import YouTube # type: ignore
19
19
 
20
20
  from vision_agent.clients.landing_public_api import LandingPublicAPI
21
+ from vision_agent.lmm.lmm import OpenAILMM
21
22
  from vision_agent.tools.tool_utils import (
23
+ filter_bboxes_by_threshold,
22
24
  get_tool_descriptions,
23
25
  get_tool_documentation,
24
26
  get_tools_df,
25
27
  get_tools_info,
26
28
  send_inference_request,
27
29
  send_task_inference_request,
28
- filter_bboxes_by_threshold,
29
30
  )
30
31
  from vision_agent.tools.tools_types import (
31
32
  FineTuning,
32
33
  Florence2FtRequest,
33
34
  JobStatus,
34
- PromptTask,
35
35
  ODResponseData,
36
+ PromptTask,
36
37
  )
37
38
  from vision_agent.utils import extract_frames_from_video
38
39
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -42,6 +43,7 @@ from vision_agent.utils.image_utils import (
42
43
  convert_quad_box_to_bbox,
43
44
  convert_to_b64,
44
45
  denormalize_bbox,
46
+ encode_image_bytes,
45
47
  frames_to_bytes,
46
48
  get_image_size,
47
49
  normalize_bbox,
@@ -691,6 +693,69 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
691
693
  return cast(str, data["answer"])
692
694
 
693
695
 
696
+ def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
697
+ """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
698
+ including regular images or images of documents or presentations. It returns text
699
+ as an answer to the question.
700
+
701
+ Parameters:
702
+ prompt (str): The question about the image
703
+ image (np.ndarray): The reference image used for the question
704
+
705
+ Returns:
706
+ str: A string which is the answer to the given prompt.
707
+
708
+ Example
709
+ -------
710
+ >>> gpt4o_image_vqa('What is the cat doing?', image)
711
+ 'drinking milk'
712
+ """
713
+
714
+ lmm = OpenAILMM()
715
+ buffer = io.BytesIO()
716
+ Image.fromarray(image).save(buffer, format="PNG")
717
+ image_bytes = buffer.getvalue()
718
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
719
+ resp = lmm.generate(prompt, [image_b64])
720
+ return cast(str, resp)
721
+
722
+
723
+ def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
724
+ """'gpt4o_video_vqa' is a tool that can answer any questions about arbitrary videos
725
+ including regular videos or videos of documents or presentations. It returns text
726
+ as an answer to the question.
727
+
728
+ Parameters:
729
+ prompt (str): The question about the video
730
+ frames (List[np.ndarray]): The reference frames used for the question
731
+
732
+ Returns:
733
+ str: A string which is the answer to the given prompt.
734
+
735
+ Example
736
+ -------
737
+ >>> gpt4o_video_vqa('Which football player made the goal?', frames)
738
+ 'Lionel Messi'
739
+ """
740
+
741
+ lmm = OpenAILMM()
742
+
743
+ if len(frames) > 10:
744
+ step = len(frames) / 10
745
+ frames = [frames[int(i * step)] for i in range(10)]
746
+
747
+ frames_b64 = []
748
+ for frame in frames:
749
+ buffer = io.BytesIO()
750
+ Image.fromarray(frame).save(buffer, format="PNG")
751
+ image_bytes = buffer.getvalue()
752
+ image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
753
+ frames_b64.append(image_b64)
754
+
755
+ resp = lmm.generate(prompt, frames_b64)
756
+ return cast(str, resp)
757
+
758
+
694
759
  def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
695
760
  """'git_vqa_v2' is a tool that can answer questions about the visual
696
761
  contents of an image given a question and an image. It returns an answer to the
@@ -1,8 +1,8 @@
1
1
  from enum import Enum
2
+ from typing import List, Optional, Tuple, Union
2
3
  from uuid import UUID
3
- from typing import List, Tuple, Optional, Union
4
4
 
5
- from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo
5
+ from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
6
6
 
7
7
 
8
8
  class BboxInput(BaseModel):
@@ -13,6 +13,8 @@ from moviepy.editor import ImageSequenceClip
13
13
  from PIL import Image, ImageDraw, ImageFont
14
14
  from PIL.Image import Image as ImageType
15
15
 
16
+ from vision_agent.utils import extract_frames_from_video
17
+
16
18
  COLORS = [
17
19
  (158, 218, 229),
18
20
  (219, 219, 141),
@@ -172,6 +174,51 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
172
174
  )
173
175
 
174
176
 
177
+ def encode_image_bytes(image: bytes) -> str:
178
+ image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
179
+ buffer = io.BytesIO()
180
+ image.save(buffer, format="PNG") # type: ignore
181
+ encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
182
+ return encoded_image
183
+
184
+
185
+ def encode_media(media: Union[str, Path]) -> str:
186
+ if isinstance(media, str) and media.startswith(("http", "https")):
187
+ # for an mp4 video url, we assume the same url also exists with a .png ending;
188
+ # vision-agent-ui will upload this png when uploading the video
189
+ if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
190
+ return media[:-4] + ".png"
191
+ return media
192
+
193
+ # if media is already a base64-encoded "data:image/" string, return it unchanged
194
+ if isinstance(media, str) and media.startswith("data:image/"):
195
+ return media
196
+
197
+ extension = "png"
198
+ extension = Path(media).suffix
199
+ if extension.lower() not in {
200
+ ".jpg",
201
+ ".jpeg",
202
+ ".png",
203
+ ".webp",
204
+ ".bmp",
205
+ ".mp4",
206
+ ".mov",
207
+ }:
208
+ raise ValueError(f"Unsupported image extension: {extension}")
209
+
210
+ image_bytes = b""
211
+ if extension.lower() in {".mp4", ".mov"}:
212
+ frames = extract_frames_from_video(str(media), fps=1)
213
+ image = frames[len(frames) // 2]
214
+ buffer = io.BytesIO()
215
+ Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
216
+ image_bytes = buffer.getvalue()
217
+ else:
218
+ image_bytes = open(media, "rb").read()
219
+ return encode_image_bytes(image_bytes)
220
+
221
+
175
222
  def denormalize_bbox(
176
223
  bbox: List[Union[int, float]], image_size: Tuple[int, ...]
177
224
  ) -> List[float]:
File without changes
File without changes
@@ -1,6 +1,6 @@
1
- import os
2
1
  import inspect
3
2
  import logging
3
+ import os
4
4
  from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
5
5
 
6
6
  import pandas as pd
@@ -10,10 +10,10 @@ from requests import Session
10
10
  from requests.adapters import HTTPAdapter
11
11
  from urllib3.util.retry import Retry
12
12
 
13
+ from vision_agent.tools.tools_types import BoundingBoxes
13
14
  from vision_agent.utils.exceptions import RemoteToolCallFailed
14
15
  from vision_agent.utils.execute import Error, MimeType
15
16
  from vision_agent.utils.type_defs import LandingaiAPIKey
16
- from vision_agent.tools.tools_types import BoundingBoxes
17
17
 
18
18
  _LOGGER = logging.getLogger(__name__)
19
19
  _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)