vision-agent 0.2.111__tar.gz → 0.2.112__tar.gz

Files changed (33)
  1. {vision_agent-0.2.111 → vision_agent-0.2.112}/PKG-INFO +1 -1
  2. {vision_agent-0.2.111 → vision_agent-0.2.112}/pyproject.toml +1 -1
  3. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent_utils.py +3 -8
  4. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder.py +28 -20
  5. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder_prompts.py +9 -7
  6. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_prompts.py +11 -10
  7. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/__init__.py +8 -4
  8. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/meta_tools.py +0 -1
  9. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tool_utils.py +8 -3
  10. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools.py +350 -110
  11. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/execute.py +0 -1
  12. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/image_utils.py +52 -0
  13. {vision_agent-0.2.111 → vision_agent-0.2.112}/LICENSE +0 -0
  14. {vision_agent-0.2.111 → vision_agent-0.2.112}/README.md +0 -0
  15. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/__init__.py +0 -0
  16. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/__init__.py +0 -0
  17. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent.py +0 -0
  18. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent.py +0 -0
  19. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/__init__.py +0 -0
  20. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/http.py +0 -0
  21. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/landing_public_api.py +1 -1
  22. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/fonts/__init__.py +0 -0
  23. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  24. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/__init__.py +0 -0
  25. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/lmm.py +0 -0
  26. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/types.py +0 -0
  27. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/prompts.py +0 -0
  28. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/video.py +0 -0
--- vision_agent-0.2.111/PKG-INFO
+++ vision_agent-0.2.112/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.111
+ Version: 0.2.112
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
--- vision_agent-0.2.111/pyproject.toml
+++ vision_agent-0.2.112/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.111"
+ version = "0.2.112"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
--- vision_agent-0.2.111/vision_agent/agent/agent_utils.py
+++ vision_agent-0.2.112/vision_agent/agent/agent_utils.py
@@ -4,14 +4,13 @@ import sys
  from typing import Any, Dict

  logging.basicConfig(stream=sys.stdout)
- _LOGGER = logging.getLogger(__name__)


  def extract_json(json_str: str) -> Dict[str, Any]:
  try:
+ json_str = json_str.replace("\n", " ")
  json_dict = json.loads(json_str)
  except json.JSONDecodeError:
- input_json_str = json_str
  if "```json" in json_str:
  json_str = json_str[json_str.find("```json") + len("```json") :]
  json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
  json_str = json_str[json_str.find("```") + len("```") :]
  # get the last ``` not one from an intermediate string
  json_str = json_str[: json_str.find("}```")]
- try:
- json_dict = json.loads(json_str)
- except json.JSONDecodeError as e:
- error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
- _LOGGER.exception(error_msg)
- raise ValueError(error_msg) from e
+ 
+ json_dict = json.loads(json_str)
  return json_dict # type: ignore


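For context, a minimal sketch of how the reworked `extract_json` behaves on a fenced model reply (the sample string is invented). Note that a parse failure in the fallback branch now surfaces as `json.JSONDecodeError` instead of the old `ValueError`, which is why the `pick_plan` change further down wraps the call in a retry loop.

```python
# Illustrative sketch only; the reply string is made up.
# extract_json now flattens newlines before the first json.loads attempt and,
# on failure, pulls the payload out of a ```json fenced block.
from vision_agent.agent.agent_utils import extract_json

reply = """```json
{"best_plan": "plan1"}
```"""

print(extract_json(reply))  # {'best_plan': 'plan1'}
```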
--- vision_agent-0.2.111/vision_agent/agent/vision_agent_coder.py
+++ vision_agent-0.2.112/vision_agent/agent/vision_agent_coder.py
@@ -4,6 +4,7 @@ import logging
  import os
  import sys
  import tempfile
+ from json import JSONDecodeError
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
  def format_plans(plans: Dict[str, Any]) -> str:
  plan_str = ""
  for k, v in plans.items():
- plan_str += f"{k}:\n"
- plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+ plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+ plan_str += " -" + "\n -".join([e for e in v["instructions"]])

  return plan_str

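The plan format change above runs through the rest of this release: each generated plan is now a dict with a `thoughts` string plus a list of plain-string `instructions`, rather than a list of `{"instructions": ...}` dicts. A hedged sketch of the new shape (values invented):

```python
# Hypothetical plan structure consumed by the updated format_plans,
# retrieve_tools and write_and_test_code; the values are invented.
plans = {
    "plan1": {
        "thoughts": "owl_v2 handles open-vocabulary detection well",
        "instructions": [
            "Load the image with load_image",
            "Detect dogs with owl_v2 and return the boxes",
        ],
    }
}

# format_plans(plans) now renders roughly:
# plan1: owl_v2 handles open-vocabulary detection well
#  -Load the image with load_image
#  -Detect dogs with owl_v2 and return the boxes
```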
@@ -228,13 +229,11 @@ def pick_plan(
228
229
  "status": "completed" if tool_output.success else "failed",
229
230
  }
230
231
  )
231
- tool_output_str = ""
232
- if len(tool_output.logs.stdout) > 0:
233
- tool_output_str = tool_output.logs.stdout[0]
232
+ tool_output_str = tool_output.text().strip()
234
233
 
235
234
  if verbosity == 2:
236
235
  _print_code("Code and test after attempted fix:", code)
237
- _LOGGER.info(f"Code execution result after attempte {count}")
236
+ _LOGGER.info(f"Code execution result after attempt {count}")
238
237
 
239
238
  count += 1
240
239
 
@@ -251,7 +250,21 @@ def pick_plan(
251
250
  tool_output=tool_output_str[:20_000],
252
251
  )
253
252
  chat[-1]["content"] = prompt
254
- best_plan = extract_json(model(chat, stream=False)) # type: ignore
253
+
254
+ count = 0
255
+ best_plan = None
256
+ while best_plan is None and count < max_retries:
257
+ try:
258
+ best_plan = extract_json(model(chat, stream=False)) # type: ignore
259
+ except JSONDecodeError as e:
260
+ _LOGGER.exception(
261
+ f"Error while extracting JSON during picking best plan {str(e)}"
262
+ )
263
+ pass
264
+ count += 1
265
+
266
+ if best_plan is None:
267
+ best_plan = {"best_plan": list(plans.keys())[0]}
255
268
 
256
269
  if verbosity >= 1:
257
270
  _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
525
538
 
526
539
 
527
540
  def retrieve_tools(
528
- plans: Dict[str, List[Dict[str, str]]],
541
+ plans: Dict[str, Dict[str, Any]],
529
542
  tool_recommender: Sim,
530
543
  log_progress: Callable[[Dict[str, Any]], None],
531
544
  verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
542
555
  tool_lists: Dict[str, List[Dict[str, str]]] = {}
543
556
  for k, plan in plans.items():
544
557
  tool_lists[k] = []
545
- for task in plan:
546
- tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
558
+ for task in plan["instructions"]:
559
+ tools = tool_recommender.top_k(task, k=2, thresh=0.3)
547
560
  tool_info.extend([e["doc"] for e in tools])
548
561
  tool_desc.extend([e["desc"] for e in tools])
549
562
  tool_lists[k].extend(
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
737
750
  if self.verbosity >= 1:
738
751
  for p in plans:
739
752
  # tabulate will fail if the keys are not the same for all elements
740
- p_fixed = [
741
- {
742
- "instructions": (
743
- e["instructions"] if "instructions" in e else ""
744
- )
745
- }
746
- for e in plans[p]
747
- ]
753
+ p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
748
754
  _LOGGER.info(
749
755
  f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
750
756
  )
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
793
799
  )
794
800
 
795
801
  if self.verbosity >= 1:
802
+ plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
796
803
  _LOGGER.info(
797
- f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
804
+ f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
798
805
  )
799
806
 
800
807
  results = write_and_test_code(
801
808
  chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
802
- plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
809
+ plan=f"\n{plan_i['thoughts']}\n-"
810
+ + "\n-".join([e for e in plan_i["instructions"]]),
803
811
  tool_info=tool_info,
804
812
  tool_output=tool_output_str,
805
813
  tool_utils=T.UTILITIES_DOCSTRING,
@@ -30,18 +30,19 @@ PLAN = """
30
30
 
31
31
  **Instructions**:
32
32
  1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
33
- 2. Output three different plans each utilize a different strategy or tool.
33
+ 2. Output three different plans each utilize a different strategy or set of tools.
34
34
 
35
35
  Output a list of jsons in the following format
36
36
 
37
37
  ```json
38
38
  {{
39
39
  "plan1":
40
- [
41
- {{
42
- "instructions": str # what you should do in this task associated with a tool
43
- }}
44
- ],
40
+ {{
41
+ "thoughts": str # your thought process for choosing this plan
42
+ "instructions": [
43
+ str # what you should do in this task associated with a tool
44
+ ]
45
+ }},
45
46
  "plan2": ...,
46
47
  "plan3": ...
47
48
  }}
@@ -127,7 +128,8 @@ PICK_PLAN = """
127
128
 
128
129
  **Instructions**:
129
130
  1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
130
- 2. Output a JSON object with the following format:
131
+ 2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
132
+ 3. Output a JSON object with the following format:
131
133
  {{
132
134
  "thoughts": str # your thought process for choosing the best plan
133
135
  "best_plan": str # the best plan you have chosen
--- vision_agent-0.2.111/vision_agent/agent/vision_agent_prompts.py
+++ vision_agent-0.2.112/vision_agent/agent/vision_agent_prompts.py
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
  **Examples**:
  Here is an example of how you can interact with a user and Actions to complete a task:
  --- START EXAMPLES ---
- [Current directory: /workspace/test]
+ [Current directory: /example/workspace]
  {examples}
  --- END EXAMPLES ---

@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
  Here is the current conversation so far:
  --- START CONVERSATION ---
  [Current directory: {dir}]
+ 
  {conversation}
  """

  EXAMPLES_CODE1 = """
  USER: Can you detect the dogs in this image? Media name dog.jpg

- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

  OBSERVATION:
- [File /workspace/test/dog_detector.py]
+ [File /example/workspace/dog_detector.py]
  0|from vision_agent.tools import load_image, owl_v2
  1|def detect_dogs(image_path: str):
  2| image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
  4| return dogs
  [End of file]

- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask

  USER: The the image only has one dog, can you fix this?

- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

  OBSERVATION:
- [File /workspace/test/dog_detector.py]
+ [File /example/workspace/dog_detector.py]
  0|from vision_agent.tools import load_image, owl_v2
  1|def detect_dogs(image_path: str):
  2| image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
  4| return dogs
  [End of file]

- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no

  USER: Yes you can use workers.png

- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}

  OBSERVATION:
- [File /workspace/test/code.py]
+ [File /example/workspace/code.py]
  0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
  1|def count_workers_with_helmets(image_path: str):
  2| image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
  15| return count
  [End of file]

- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
--- vision_agent-0.2.111/vision_agent/tools/__init__.py
+++ vision_agent-0.2.112/vision_agent/tools/__init__.py
@@ -19,16 +19,20 @@ from .tools import (
  detr_segmentation,
  dpt_hybrid_midas,
  extract_frames,
- florencev2_image_caption,
- florencev2_object_detection,
- florencev2_roberta_vqa,
- florencev2_ocr,
+ florence2_image_caption,
+ florence2_object_detection,
+ florence2_ocr,
+ florence2_roberta_vqa,
+ florence2_sam2_image,
+ florence2_sam2_video,
  generate_pose_image,
  generate_soft_edge_image,
  get_tool_documentation,
  git_vqa_v2,
  grounding_dino,
  grounding_sam,
+ ixc25_image_vqa,
+ ixc25_video_vqa,
  load_image,
  loca_visual_prompt_counting,
  loca_zero_shot_counting,
--- vision_agent-0.2.111/vision_agent/tools/meta_tools.py
+++ vision_agent-0.2.112/vision_agent/tools/meta_tools.py
@@ -8,7 +8,6 @@ from vision_agent.lmm.types import Message
  from vision_agent.tools.tool_utils import get_tool_documentation
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS

- 
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

  CURRENT_FILE = None
--- vision_agent-0.2.111/vision_agent/tools/tool_utils.py
+++ vision_agent-0.2.112/vision_agent/tools/tool_utils.py
@@ -1,7 +1,7 @@
  import inspect
  import logging
  import os
- from typing import Any, Callable, Dict, List, MutableMapping, Optional
+ from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

  import pandas as pd
  from IPython.display import display
@@ -31,6 +31,7 @@ class ToolCallTrace(BaseModel):
  def send_inference_request(
  payload: Dict[str, Any],
  endpoint_name: str,
+ files: Optional[List[Tuple[Any, ...]]] = None,
  v2: bool = False,
  metadata_payload: Optional[Dict[str, Any]] = None,
  ) -> Dict[str, Any]:
@@ -50,7 +51,7 @@ def send_inference_request(
  response={},
  error=None,
  )
- headers = {"Content-Type": "application/json", "apikey": _LND_API_KEY}
+ headers = {"apikey": _LND_API_KEY}
  if "TOOL_ENDPOINT_AUTH" in os.environ:
  headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
  headers.pop("apikey")
@@ -60,7 +61,11 @@
  num_retry=3,
  headers=headers,
  )
- res = session.post(url, json=payload)
+ 
+ if files is not None:
+ res = session.post(url, data=payload, files=files)
+ else:
+ res = session.post(url, json=payload)
  if res.status_code != 200:
  tool_call_trace.error = Error(
  name="RemoteToolCallFailed",
--- vision_agent-0.2.111/vision_agent/tools/tools.py
+++ vision_agent-0.2.112/vision_agent/tools/tools.py
@@ -2,47 +2,50 @@ import io
  import json
  import logging
  import tempfile
- from uuid import UUID
- from pathlib import Path
  from importlib import resources
+ from pathlib import Path
  from typing import Any, Dict, List, Optional, Tuple, Union, cast
+ from uuid import UUID

  import cv2
- import requests
  import numpy as np
- from pytube import YouTube # type: ignore
+ import requests
  from moviepy.editor import ImageSequenceClip
  from PIL import Image, ImageDraw, ImageFont
  from pillow_heif import register_heif_opener # type: ignore
+ from pytube import YouTube # type: ignore

+ from vision_agent.clients.landing_public_api import LandingPublicAPI
  from vision_agent.tools.tool_utils import (
- send_inference_request,
  get_tool_descriptions,
  get_tool_documentation,
  get_tools_df,
  get_tools_info,
+ send_inference_request,
+ )
+ from vision_agent.tools.tools_types import (
+ BboxInput,
+ BboxInputBase64,
+ FineTuning,
+ Florencev2FtRequest,
+ JobStatus,
+ PromptTask,
  )
- from vision_agent.utils.exceptions import FineTuneModelIsNotReady
  from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.exceptions import FineTuneModelIsNotReady
  from vision_agent.utils.execute import FileSerializer, MimeType
  from vision_agent.utils.image_utils import (
  b64_to_pil,
+ convert_quad_box_to_bbox,
  convert_to_b64,
  denormalize_bbox,
+ frames_to_bytes,
  get_image_size,
  normalize_bbox,
- convert_quad_box_to_bbox,
+ numpy_to_bytes,
  rle_decode,
+ rle_decode_array,
  )
- from vision_agent.tools.tools_types import (
- BboxInput,
- BboxInputBase64,
- PromptTask,
- Florencev2FtRequest,
- FineTuning,
- JobStatus,
- )
- from vision_agent.clients.landing_public_api import LandingPublicAPI

  register_heif_opener()

@@ -141,9 +144,9 @@ def owl_v2(
  box_threshold: float = 0.10,
  ) -> List[Dict[str, Any]]:
  """'owl_v2' is a tool that can detect and count multiple objects given a text
- prompt such as category names or referring expressions. The categories in text prompt
- are separated by commas. It returns a list of bounding boxes with
- normalized coordinates, label names and associated probability scores.
+ prompt such as category names or referring expressions. The categories in text
+ prompt are separated by commas. It returns a list of bounding boxes with normalized
+ coordinates, label names and associated probability scores.

  Parameters:
  prompt (str): The prompt to ground to the image.
@@ -194,10 +197,10 @@ def grounding_sam(
  box_threshold: float = 0.20,
  iou_threshold: float = 0.20,
  ) -> List[Dict[str, Any]]:
- """'grounding_sam' is a tool that can segment multiple objects given a
- text prompt such as category names or referring expressions. The categories in text
- prompt are separated by commas or periods. It returns a list of bounding boxes,
- label names, mask file names and associated probability scores.
+ """'grounding_sam' is a tool that can segment multiple objects given a text prompt
+ such as category names or referring expressions. The categories in text prompt are
+ separated by commas or periods. It returns a list of bounding boxes, label names,
+ mask file names and associated probability scores.

  Parameters:
  prompt (str): The prompt to ground to the image.
@@ -254,52 +257,114 @@ def grounding_sam(
  return return_data


- def extract_frames(
- video_uri: Union[str, Path], fps: float = 0.5
- ) -> List[Tuple[np.ndarray, float]]:
- """'extract_frames' extracts frames from a video which can be a file path or youtube
- link, returns a list of tuples (frame, timestamp), where timestamp is the relative
- time in seconds where the frame was captured. The frame is a numpy array.
+ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+ """'florence2_sam2_image' is a tool that can segment multiple objects given a text
+ prompt such as category names or referring expressions. The categories in the text
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
+ mask file names and associated probability scores of 1.0.

  Parameters:
- video_uri (Union[str, Path]): The path to the video file or youtube link
- fps (float, optional): The frame rate per second to extract the frames. Defaults
- to 0.5.
+ prompt (str): The prompt to ground to the image.
+ image (np.ndarray): The image to ground the prompt to.

  Returns:
- List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
- as a numpy array and the timestamp in seconds.
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+ bounding box, and mask of the detected objects with normalized coordinates
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+ the background.

  Example
  -------
- >>> extract_frames("path/to/video.mp4")
- [(frame1, 0.0), (frame2, 0.5), ...]
+ >>> florence2_sam2_image("car, dinosaur", image)
+ [
+ {
+ 'score': 1.0,
+ 'label': 'dinosaur',
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0],
+ ...,
+ [0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+ },
+ ]
  """
+ buffer_bytes = numpy_to_bytes(image)

- if str(video_uri).startswith(
- (
- "http://www.youtube.com/",
- "https://www.youtube.com/",
- "http://youtu.be/",
- "https://youtu.be/",
- )
- ):
- with tempfile.TemporaryDirectory() as temp_dir:
- yt = YouTube(str(video_uri))
- # Download the highest resolution video
- video = (
- yt.streams.filter(progressive=True, file_extension="mp4")
- .order_by("resolution")
- .desc()
- .first()
- )
- if not video:
- raise Exception("No suitable video stream found")
- video_file_path = video.download(output_path=temp_dir)
+ files = [("image", buffer_bytes)]
+ payload = {
+ "prompts": [s.strip() for s in prompt.split(",")],
+ "function_name": "florence2_sam2_image",
+ }
+ data: Dict[str, Any] = send_inference_request(
+ payload, "florence2-sam2", files=files, v2=True
+ )
+ return_data = []
+ for _, data_i in data["0"].items():
+ mask = rle_decode_array(data_i["mask"])
+ label = data_i["label"]
+ bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+ return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+ return return_data

- return extract_frames_from_video(video_file_path, fps)

- return extract_frames_from_video(str(video_uri), fps)
+ def florence2_sam2_video(
+ prompt: str, frames: List[np.ndarray]
+ ) -> List[List[Dict[str, Any]]]:
+ """'florence2_sam2_video' is a tool that can segment and track multiple entities
+ in a video given a text prompt such as category names or referring expressions. You
+ can optionally separate the categories in the text with commas. It only tracks
+ entities present in the first frame and only returns segmentation masks. It is
+ useful for tracking and counting without duplicating counts.
+ 
+ Parameters:
+ prompt (str): The prompt to ground to the video.
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
+ 
+ Returns:
+ List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
+ and segment mask. The outer list represents each frame and the inner list is
+ the entities per frame. The label contains the object ID followed by the label
+ name. The objects are only identified in the first framed and tracked
+ throughout the video.
+ 
+ Example
+ -------
+ >>> florence2_sam2_video("car, dinosaur", frames)
+ [
+ [
+ {
+ 'label': '0: dinosaur',
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0],
+ ...,
+ [0, 0, 0, ..., 0, 0, 0],
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+ },
+ ],
+ ]
+ """
+ 
+ buffer_bytes = frames_to_bytes(frames)
+ files = [("video", buffer_bytes)]
+ payload = {
+ "prompts": prompt.split(","),
+ "function_name": "florence2_sam2_video",
+ }
+ data: Dict[str, Any] = send_inference_request(
+ payload, "florence2-sam2", files=files, v2=True
+ )
+ return_data = []
+ for frame_i in data.keys():
+ return_frame_data = []
+ for obj_id, data_j in data[frame_i].items():
+ mask = rle_decode_array(data_j["mask"])
+ label = obj_id + ": " + data_j["label"]
+ return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+ return_data.append(return_frame_data)
+ return return_data


  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
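A hedged sketch of how the two new SAM2-backed tools compose with the existing helpers (the media paths and prompt are invented):

```python
# Illustrative only; "dogs.jpg" / "dogs.mp4" and the prompt are invented.
from vision_agent.tools import (
    extract_frames,
    florence2_sam2_image,
    florence2_sam2_video,
    load_image,
)

image = load_image("dogs.jpg")
detections = florence2_sam2_image("dog", image)  # bbox + mask per dog, score 1.0

frames = [frame for frame, _ in extract_frames("dogs.mp4", fps=1)]
tracks = florence2_sam2_video("dog", frames)  # one list of mask dicts per frame
print(len(tracks), len(tracks[0]))  # number of frames, entities tracked from frame 0
```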
@@ -368,12 +433,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:

  Returns:
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
- value. E.g. {count: 12}.
+ value, e.g. {count: 12} and a heat map for visaulization purposes.

  Example
  -------
  >>> loca_zero_shot_counting(image)
- {'count': 45},
+ {'count': 83,
+ 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+ [ 0, 0, 0, ..., 0, 0, 0],
+ [ 0, 0, 0, ..., 0, 0, 1],
+ ...,
+ [ 0, 0, 0, ..., 30, 35, 41],
+ [ 0, 0, 0, ..., 41, 47, 53],
+ [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
  """

  image_b64 = convert_to_b64(image)
@@ -398,12 +470,19 @@ def loca_visual_prompt_counting(

  Returns:
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
- value. E.g. {count: 12}.
+ value, e.g. {count: 12} and a heat map for visaulization purposes.

  Example
  -------
  >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
- {'count': 45},
+ {'count': 83,
+ 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+ [ 0, 0, 0, ..., 0, 0, 0],
+ [ 0, 0, 0, ..., 0, 0, 1],
+ ...,
+ [ 0, 0, 0, ..., 30, 35, 41],
+ [ 0, 0, 0, ..., 41, 47, 53],
+ [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
  """

  image_size = get_image_size(image)
@@ -420,8 +499,8 @@
  return resp_data


- def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
- """'florencev2_roberta_vqa' is a tool that takes an image and analyzes
+ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+ """'florence2_roberta_vqa' is a tool that takes an image and analyzes
  its contents, generates detailed captions and then tries to answer the given
  question using the generated context. It returns text as an answer to the question.

@@ -434,7 +513,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:

  Example
  -------
- >>> florencev2_roberta_vqa('What is the top left animal in this image ?', image)
+ >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
  'white tiger'
  """

@@ -442,13 +521,73 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
  data = {
  "image": image_b64,
  "question": prompt,
- "function_name": "florencev2_roberta_vqa",
+ "function_name": "florence2_roberta_vqa",
  }

  answer = send_inference_request(data, "florence2-qa", v2=True)
  return answer # type: ignore


+ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+ """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
+ including regular images or images of documents or presentations. It returns text
+ as an answer to the question.
+ 
+ Parameters:
+ prompt (str): The question about the image
+ image (np.ndarray): The reference image used for the question
+ 
+ Returns:
+ str: A string which is the answer to the given prompt.
+ 
+ Example
+ -------
+ >>> ixc25_image_vqa('What is the cat doing?', image)
+ 'drinking milk'
+ """
+ 
+ buffer_bytes = numpy_to_bytes(image)
+ files = [("image", buffer_bytes)]
+ payload = {
+ "prompt": prompt,
+ "function_name": "ixc25_image_vqa",
+ }
+ data: Dict[str, Any] = send_inference_request(
+ payload, "internlm-xcomposer2", files=files, v2=True
+ )
+ return cast(str, data["answer"])
+ 
+ 
+ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+ """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
+ including regular videos or videos of documents or presentations. It returns text
+ as an answer to the question.
+ 
+ Parameters:
+ prompt (str): The question about the video
+ frames (List[np.ndarray]): The reference frames used for the question
+ 
+ Returns:
+ str: A string which is the answer to the given prompt.
+ 
+ Example
+ -------
+ >>> ixc25_video_vqa('Which football player made the goal?', frames)
+ 'Lionel Messi'
+ """
+ 
+ buffer_bytes = frames_to_bytes(frames)
+ files = [("video", buffer_bytes)]
+ payload = {
+ "prompt": prompt,
+ "function_name": "ixc25_video_vqa",
+ }
+ data: Dict[str, Any] = send_inference_request(
+ payload, "internlm-xcomposer2", files=files, v2=True
+ )
+ return cast(str, data["answer"])
+ 
+ 
  def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
  """'git_vqa_v2' is a tool that can answer questions about the visual
  contents of an image given a question and an image. It returns an answer to the
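A hedged sketch of the new InternLM-XComposer 2.5 VQA tools on an image and on video frames (file names and questions are invented):

```python
# Illustrative only; the file names and the questions are invented.
from vision_agent.tools import extract_frames, ixc25_image_vqa, ixc25_video_vqa, load_image

slide = load_image("slide.png")
print(ixc25_image_vqa("What is the title of this slide?", slide))

frames = [frame for frame, _ in extract_frames("match.mp4", fps=1)]
print(ixc25_video_vqa("Which player scored?", frames))
```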
@@ -592,8 +731,8 @@ def blip_image_caption(image: np.ndarray) -> str:
  return answer["text"][0] # type: ignore


- def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
- """'florencev2_image_caption' is a tool that can caption or describe an image based
+ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+ """'florence2_image_caption' is a tool that can caption or describe an image based
  on its contents. It returns a text describing the image.

  Parameters:
@@ -606,7 +745,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->

  Example
  -------
- >>> florencev2_image_caption(image, False)
+ >>> florence2_image_caption(image, False)
  'This image contains a cat sitting on a table with a bowl of milk.'
  """
  image_b64 = convert_to_b64(image)
@@ -614,17 +753,19 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
  data = {
  "image": image_b64,
  "task": task,
- "function_name": "florencev2_image_caption",
+ "function_name": "florence2_image_caption",
  }

  answer = send_inference_request(data, "florence2", v2=True)
  return answer[task] # type: ignore


- def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
- """'florencev2_object_detection' is a tool that can detect objects given a text
- prompt such as a phrase or class names separated by commas. It returns a list of
- detected objects as labels and their location as bounding boxes with score of 1.0.
+ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+ """'florencev2_object_detection' is a tool that can detect and count multiple
+ objects given a text prompt such as category names or referring expressions. You
+ can optionally separate the categories in the text with commas. It returns a list
+ of bounding boxes with normalized coordinates, label names and associated
+ probability scores of 1.0.

  Parameters:
  prompt (str): The prompt to ground to the image.
@@ -639,7 +780,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str

  Example
  -------
- >>> florencev2_object_detection('person looking at a coyote', image)
+ >>> florence2_object_detection('person looking at a coyote', image)
  [
  {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -651,7 +792,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
  "image": image_b64,
  "task": "<CAPTION_TO_PHRASE_GROUNDING>",
  "prompt": prompt,
- "function_name": "florencev2_object_detection",
+ "function_name": "florence2_object_detection",
  }

  detections = send_inference_request(data, "florence2", v2=True)
@@ -668,8 +809,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
  return return_data


- def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
- """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+ """'florence2_ocr' is a tool that can detect text and text regions in an image.
  Each text region contains one line of text. It returns a list of detected text,
  the text region as a bounding box with normalized coordinates, and confidence
  scores. The results are sorted from top-left to bottom right.
@@ -683,7 +824,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

  Example
  -------
- >>> florencev2_ocr(image)
+ >>> florence2_ocr(image)
  [
  {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
  ]
@@ -694,7 +835,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  data = {
  "image": image_b64,
  "task": "<OCR_WITH_REGION>",
- "function_name": "florencev2_ocr",
+ "function_name": "florence2_ocr",
  }

  detections = send_inference_request(data, "florence2", v2=True)
@@ -1035,6 +1176,54 @@ def closest_box_distance(
  # Utility and visualization functions


+ def extract_frames(
+ video_uri: Union[str, Path], fps: float = 1
+ ) -> List[Tuple[np.ndarray, float]]:
+ """'extract_frames' extracts frames from a video which can be a file path or youtube
+ link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+ time in seconds where the frame was captured. The frame is a numpy array.
+ 
+ Parameters:
+ video_uri (Union[str, Path]): The path to the video file or youtube link
+ fps (float, optional): The frame rate per second to extract the frames. Defaults
+ to 10.
+ 
+ Returns:
+ List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+ as a numpy array and the timestamp in seconds.
+ 
+ Example
+ -------
+ >>> extract_frames("path/to/video.mp4")
+ [(frame1, 0.0), (frame2, 0.5), ...]
+ """
+ 
+ if str(video_uri).startswith(
+ (
+ "http://www.youtube.com/",
+ "https://www.youtube.com/",
+ "http://youtu.be/",
+ "https://youtu.be/",
+ )
+ ):
+ with tempfile.TemporaryDirectory() as temp_dir:
+ yt = YouTube(str(video_uri))
+ # Download the highest resolution video
+ video = (
+ yt.streams.filter(progressive=True, file_extension="mp4")
+ .order_by("resolution")
+ .desc()
+ .first()
+ )
+ if not video:
+ raise Exception("No suitable video stream found")
+ video_file_path = video.download(output_path=temp_dir)
+ 
+ return extract_frames_from_video(video_file_path, fps)
+ 
+ return extract_frames_from_video(str(video_uri), fps)
+ 
+ 
  def save_json(data: Any, file_path: str) -> None:
  """'save_json' is a utility function that saves data as a JSON file. It is helpful
  for saving data that contains NumPy arrays which are not JSON serializable.
@@ -1099,7 +1288,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:


  def save_video(
- frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 4
+ frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
  ) -> str:
  """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.

@@ -1201,15 +1390,43 @@ def overlay_bounding_boxes(
  return np.array(pil_image)


+ def _get_text_coords_from_mask(
+ mask: np.ndarray, v_gap: int = 10, h_gap: int = 10
+ ) -> Tuple[int, int]:
+ mask = mask.astype(np.uint8)
+ if np.sum(mask) == 0:
+ return (0, 0)
+ 
+ rows, cols = np.nonzero(mask)
+ top = rows.min()
+ bottom = rows.max()
+ left = cols.min()
+ right = cols.max()
+ 
+ if top - v_gap < 0:
+ if bottom + v_gap > mask.shape[0]:
+ top = top
+ else:
+ top = bottom + v_gap
+ else:
+ top = top - v_gap
+ 
+ return left + (right - left) // 2 - h_gap, top
+ 
+ 
  def overlay_segmentation_masks(
- image: np.ndarray, masks: List[Dict[str, Any]]
- ) -> np.ndarray:
+ medias: Union[np.ndarray, List[np.ndarray]],
+ masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
+ draw_label: bool = True,
+ ) -> Union[np.ndarray, List[np.ndarray]]:
  """'overlay_segmentation_masks' is a utility function that displays segmentation
  masks.

  Parameters:
- image (np.ndarray): The image to display the masks on.
- masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+ medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+ the masks on.
+ masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+ dictionaries containing the masks.

  Returns:
  np.ndarray: The image with the masks displayed.
@@ -1229,27 +1446,50 @@ def overlay_segmentation_masks(
  }],
  )
  """
- pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+ medias_int: List[np.ndarray] = (
+ [medias] if isinstance(medias, np.ndarray) else medias
+ )
+ masks_int = [masks] if isinstance(masks[0], dict) else masks
+ masks_int = cast(List[List[Dict[str, Any]]], masks_int)

- if len(set([mask["label"] for mask in masks])) > len(COLORS):
- _LOGGER.warning(
- "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
- )
+ labels = set()
+ for mask_i in masks_int:
+ for mask_j in mask_i:
+ labels.add(mask_j["label"])
+ color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}

- color = {
- label: COLORS[i % len(COLORS)]
- for i, label in enumerate(set([mask["label"] for mask in masks]))
- }
- masks = sorted(masks, key=lambda x: x["label"], reverse=True)
+ width, height = Image.fromarray(medias_int[0]).size
+ fontsize = max(12, int(min(width, height) / 40))
+ font = ImageFont.truetype(
+ str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+ fontsize,
+ )

- for elt in masks:
- mask = elt["mask"]
- label = elt["label"]
- np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
- np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
- mask_img = Image.fromarray(np_mask.astype(np.uint8))
- pil_image = Image.alpha_composite(pil_image, mask_img)
- return np.array(pil_image)
+ frame_out = []
+ for i, frame in enumerate(medias_int):
+ pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
+ for elt in masks_int[i]:
+ mask = elt["mask"]
+ label = elt["label"]
+ np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+ np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+ mask_img = Image.fromarray(np_mask.astype(np.uint8))
+ pil_image = Image.alpha_composite(pil_image, mask_img)
+ 
+ if draw_label:
+ draw = ImageDraw.Draw(pil_image)
+ text_box = draw.textbbox((0, 0), text=label, font=font)
+ x, y = _get_text_coords_from_mask(
+ mask,
+ v_gap=(text_box[3] - text_box[1]) + 10,
+ h_gap=(text_box[2] - text_box[0]) // 2,
+ )
+ if x != 0 and y != 0:
+ text_box = draw.textbbox((x, y), text=label, font=font)
+ draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
+ draw.text((x, y), label, fill="black", font=font)
+ frame_out.append(np.array(pil_image))
+ return frame_out[0] if len(frame_out) == 1 else frame_out


  def overlay_heat_map(
@@ -1412,7 +1652,6 @@ def florencev2_fine_tuned_object_detection(

  TOOLS = [
  owl_v2,
- grounding_sam,
  extract_frames,
  ocr,
  clip,
@@ -1420,13 +1659,15 @@ TOOLS = [
  vit_nsfw_classification,
  loca_zero_shot_counting,
  loca_visual_prompt_counting,
- florencev2_roberta_vqa,
- florencev2_image_caption,
- florencev2_ocr,
+ florence2_image_caption,
+ florence2_ocr,
+ florence2_sam2_image,
+ florence2_sam2_video,
+ florence2_object_detection,
+ ixc25_image_vqa,
+ ixc25_video_vqa,
  detr_segmentation,
  depth_anything_v2,
- generate_soft_edge_image,
- dpt_hybrid_midas,
  generate_pose_image,
  closest_mask_distance,
  closest_box_distance,
@@ -1437,7 +1678,6 @@ TOOLS = [
  overlay_bounding_boxes,
  overlay_segmentation_masks,
  overlay_heat_map,
- template_match,
  ]
  TOOLS_DF = get_tools_df(TOOLS) # type: ignore
  TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
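Since `overlay_segmentation_masks` now accepts either a single image or a list of frames (and draws labels by default), a hedged sketch of pairing it with the `florence2_sam2_video` output (file name and prompt are invented):

```python
# Illustrative only; combines the new video tracking output with the updated overlay.
from vision_agent.tools.tools import (
    extract_frames,
    florence2_sam2_video,
    overlay_segmentation_masks,
    save_video,
)

frames = [frame for frame, _ in extract_frames("dogs.mp4", fps=1)]
masks_per_frame = florence2_sam2_video("dog", frames)

# One list of mask dicts per frame in, one annotated frame per frame out.
annotated = overlay_segmentation_masks(frames, masks_per_frame)
save_video(annotated, "dogs_tracked.mp4", fps=1)
```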
--- vision_agent-0.2.111/vision_agent/utils/execute.py
+++ vision_agent-0.2.112/vision_agent/utils/execute.py
@@ -416,7 +416,6 @@ class CodeInterpreter(abc.ABC):


  class E2BCodeInterpreter(CodeInterpreter):
- 
  def __init__(self, *args: Any, **kwargs: Any) -> None:
  super().__init__(*args, **kwargs)
  assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set"
--- vision_agent-0.2.111/vision_agent/utils/image_utils.py
+++ vision_agent-0.2.112/vision_agent/utils/image_utils.py
@@ -1,12 +1,15 @@
  """Utility functions for image processing."""

  import base64
+ import io
+ import tempfile
  from importlib import resources
  from io import BytesIO
  from pathlib import Path
  from typing import Dict, List, Tuple, Union

  import numpy as np
+ from moviepy.editor import ImageSequenceClip
  from PIL import Image, ImageDraw, ImageFont
  from PIL.Image import Image as ImageType

@@ -63,6 +66,46 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
  return img.reshape(shape)


+ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
+ r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
+ 
+ Parameters:
+ mask: The mask in run-length encoded as an array.
+ """
+ size = rle["size"]
+ counts = rle["counts"]
+ 
+ total_elements = size[0] * size[1]
+ flattened_mask = np.zeros(total_elements, dtype=np.uint8)
+ 
+ current_pos = 0
+ for i, count in enumerate(counts):
+ if i % 2 == 1:
+ flattened_mask[current_pos : current_pos + count] = 1
+ current_pos += count
+ 
+ binary_mask = flattened_mask.reshape(size, order="F")
+ return binary_mask
+ 
+ 
+ def frames_to_bytes(
+ frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4"
+ ) -> bytes:
+ r"""Convert a list of frames to a video file encoded into a byte string.
+ 
+ Parameters:
+ frames: the list of frames
+ fps: the frames per second of the video
+ file_ext: the file extension of the video file
+ """
+ with tempfile.NamedTemporaryFile(delete=True) as temp_file:
+ clip = ImageSequenceClip(frames, fps=fps)
+ clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps)
+ with open(temp_file.name + f".{file_ext}", "rb") as f:
+ buffer_bytes = f.read()
+ return buffer_bytes
+ 
+ 
  def b64_to_pil(b64_str: str) -> ImageType:
  r"""Convert a base64 string to a PIL Image.

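A small worked example of `rle_decode_array` on a toy run-length encoding (values invented) to show the format the SAM2 tools decode:

```python
# Toy example of the RLE format consumed by rle_decode_array: counts alternate
# between background and foreground runs, laid out in column-major (Fortran) order.
import numpy as np
from vision_agent.utils.image_utils import rle_decode_array

rle = {"size": [2, 3], "counts": [1, 2, 1, 2]}  # 6 pixels: 1 off, 2 on, 1 off, 2 on
mask = rle_decode_array(rle)
print(mask)
# [[0 1 1]
#  [1 0 1]]
```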
@@ -78,6 +121,15 @@ def b64_to_pil(b64_str: str) -> ImageType:
  return Image.open(BytesIO(base64.b64decode(b64_str)))


+ def numpy_to_bytes(image: np.ndarray) -> bytes:
+ pil_image = Image.fromarray(image).convert("RGB")
+ image_buffer = io.BytesIO()
+ pil_image.save(image_buffer, format="PNG")
+ buffer_bytes = image_buffer.getvalue()
+ image_buffer.close()
+ return buffer_bytes
+ 
+ 
  def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
  r"""Get the size of an image.

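These two helpers are how the new tools ship media to the remote endpoints: `numpy_to_bytes` PNG-encodes a single image and `frames_to_bytes` writes the frames to a temporary mp4. A hedged sketch (frame contents are synthetic):

```python
# Illustrative sketch; the frames are synthetic placeholders.
import numpy as np
from vision_agent.utils.image_utils import frames_to_bytes, numpy_to_bytes

image = np.zeros((240, 320, 3), dtype=np.uint8)
png_bytes = numpy_to_bytes(image)            # PNG-encoded single image

frames = [image.copy() for _ in range(10)]
mp4_bytes = frames_to_bytes(frames, fps=10)  # mp4-encoded clip of the frames
print(len(png_bytes), len(mp4_bytes))
```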
--- vision_agent-0.2.111/vision_agent/clients/landing_public_api.py
+++ vision_agent-0.2.112/vision_agent/clients/landing_public_api.py
@@ -1,6 +1,6 @@
  import os
- from uuid import UUID
  from typing import List
+ from uuid import UUID

  from requests.exceptions import HTTPError