vision-agent 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -718,7 +718,12 @@ class VisionAgentCoder(Agent):
718
718
  for chat_i in chat:
719
719
  if "media" in chat_i:
720
720
  for media in chat_i["media"]:
721
- media = code_interpreter.upload_file(media)
721
+ media = (
722
+ media
723
+ if type(media) is str
724
+ and media.startswith(("http", "https"))
725
+ else code_interpreter.upload_file(media)
726
+ )
722
727
  chat_i["content"] += f" Media name {media}" # type: ignore
723
728
  media_list.append(media)
724
729
 
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
744
749
  results = {"code": "", "test": "", "plan": []}
745
750
  plan = []
746
751
  success = False
747
- self.log_progress(
748
- {
749
- "type": "log",
750
- "log_content": "Creating plans",
751
- "status": "started",
752
- }
753
- )
754
- plans = write_plans(
755
- int_chat,
756
- T.get_tool_descriptions_by_names(
757
- customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
758
- ),
759
- format_memory(working_memory),
760
- self.planner,
752
+
753
+ plans = self._create_plans(
754
+ int_chat, customized_tool_names, working_memory, self.planner
761
755
  )
762
756
 
763
- if self.verbosity >= 1:
764
- for p in plans:
765
- # tabulate will fail if the keys are not the same for all elements
766
- p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
767
- _LOGGER.info(
768
- f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
769
- )
757
+ if test_multi_plan:
758
+ self._log_plans(plans, self.verbosity)
759
+
770
760
  tool_infos = retrieve_tools(
771
761
  plans,
772
762
  self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
860
850
  if self.report_progress_callback is not None:
861
851
  self.report_progress_callback(data)
862
852
 
853
+ def _create_plans(
854
+ self,
855
+ int_chat: List[Message],
856
+ customized_tool_names: Optional[List[str]],
857
+ working_memory: List[Dict[str, str]],
858
+ planner: LMM,
859
+ ) -> Dict[str, Any]:
860
+ self.log_progress(
861
+ {
862
+ "type": "log",
863
+ "log_content": "Creating plans",
864
+ "status": "started",
865
+ }
866
+ )
867
+ plans = write_plans(
868
+ int_chat,
869
+ T.get_tool_descriptions_by_names(
870
+ customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
871
+ ),
872
+ format_memory(working_memory),
873
+ planner,
874
+ )
875
+ return plans
876
+
877
+ def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
878
+ if verbosity >= 1:
879
+ for p in plans:
880
+ # tabulate will fail if the keys are not the same for all elements
881
+ p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
882
+ _LOGGER.info(
883
+ f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
884
+ )
885
+
863
886
 
864
887
  class OllamaVisionAgentCoder(VisionAgentCoder):
865
888
  """VisionAgentCoder that uses Ollama models for planning, coding, testing.
vision_agent/lmm/lmm.py CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
30
30
 
31
31
 
32
32
  def encode_media(media: Union[str, Path]) -> str:
33
+ if type(media) is str and media.startswith(("http", "https")):
34
+ # for mp4 video url, we assume there is a same url but ends with png
35
+ # vision-agent-ui will upload this png when uploading the video
36
+ if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
37
+ return media[:-4] + ".png"
38
+ return media
33
39
  extension = "png"
34
40
  extension = Path(media).suffix
35
41
  if extension.lower() not in {
@@ -138,7 +144,11 @@ class OpenAILMM(LMM):
138
144
  {
139
145
  "type": "image_url",
140
146
  "image_url": {
141
- "url": f"data:image/png;base64,{encoded_media}",
147
+ "url": (
148
+ encoded_media
149
+ if encoded_media.startswith(("http", "https"))
150
+ else f"data:image/png;base64,{encoded_media}"
151
+ ),
142
152
  "detail": "low",
143
153
  },
144
154
  },
@@ -390,7 +400,6 @@ class OllamaLMM(LMM):
390
400
  tmp_kwargs = self.kwargs | kwargs
391
401
  data.update(tmp_kwargs)
392
402
  if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
393
-
394
403
  json_data = json.dumps(data)
395
404
 
396
405
  def f() -> Iterator[Optional[str]]:
@@ -424,7 +433,6 @@ class OllamaLMM(LMM):
424
433
  media: Optional[List[Union[str, Path]]] = None,
425
434
  **kwargs: Any,
426
435
  ) -> Union[str, Iterator[Optional[str]]]:
427
-
428
436
  url = f"{self.url}/generate"
429
437
  data: Dict[str, Any] = {
430
438
  "model": self.model_name,
@@ -439,7 +447,6 @@ class OllamaLMM(LMM):
439
447
  tmp_kwargs = self.kwargs | kwargs
440
448
  data.update(tmp_kwargs)
441
449
  if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
442
-
443
450
  json_data = json.dumps(data)
444
451
 
445
452
  def f() -> Iterator[Optional[str]]:
@@ -21,7 +21,7 @@ from .tools import (
21
21
  dpt_hybrid_midas,
22
22
  extract_frames,
23
23
  florence2_image_caption,
24
- florence2_object_detection,
24
+ florence2_phrase_grounding,
25
25
  florence2_ocr,
26
26
  florence2_roberta_vqa,
27
27
  florence2_sam2_image,
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import io
2
3
  import json
3
4
  import logging
@@ -14,6 +15,7 @@ from moviepy.editor import ImageSequenceClip
14
15
  from PIL import Image, ImageDraw, ImageFont
15
16
  from pillow_heif import register_heif_opener # type: ignore
16
17
  from pytube import YouTube # type: ignore
18
+ import urllib.request
17
19
 
18
20
  from vision_agent.clients.landing_public_api import LandingPublicAPI
19
21
  from vision_agent.tools.tool_utils import (
@@ -760,10 +762,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
760
762
  return answer[task] # type: ignore
761
763
 
762
764
 
763
- def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
764
- """'florencev2_object_detection' is a tool that can detect and count multiple
765
- objects given a text prompt such as category names or referring expressions. You
766
- can optionally separate the categories in the text with commas. It returns a list
765
+ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
766
+ """'florence2_phrase_grounding' is a tool that can detect multiple
767
+ objects given a text prompt which can be object names or caption. You
768
+ can optionally separate the object names in the text with commas. It returns a list
767
769
  of bounding boxes with normalized coordinates, label names and associated
768
770
  probability scores of 1.0.
769
771
 
@@ -780,7 +782,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
780
782
 
781
783
  Example
782
784
  -------
783
- >>> florence2_object_detection('person looking at a coyote', image)
785
+ >>> florence2_phrase_grounding('person looking at a coyote', image)
784
786
  [
785
787
  {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
786
788
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -792,7 +794,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
792
794
  "image": image_b64,
793
795
  "task": "<CAPTION_TO_PHRASE_GROUNDING>",
794
796
  "prompt": prompt,
795
- "function_name": "florence2_object_detection",
797
+ "function_name": "florence2_phrase_grounding",
796
798
  }
797
799
 
798
800
  detections = send_inference_request(data, "florence2", v2=True)
@@ -1220,6 +1222,13 @@ def extract_frames(
1220
1222
  video_file_path = video.download(output_path=temp_dir)
1221
1223
 
1222
1224
  return extract_frames_from_video(video_file_path, fps)
1225
+ elif str(video_uri).startswith(("http", "https")):
1226
+ _, image_suffix = os.path.splitext(video_uri)
1227
+ with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
1228
+ # Download the video and save it to the temporary file
1229
+ with urllib.request.urlopen(str(video_uri)) as response:
1230
+ tmp_file.write(response.read())
1231
+ return extract_frames_from_video(tmp_file.name, fps)
1223
1232
 
1224
1233
  return extract_frames_from_video(str(video_uri), fps)
1225
1234
 
@@ -1250,10 +1259,10 @@ def save_json(data: Any, file_path: str) -> None:
1250
1259
 
1251
1260
 
1252
1261
  def load_image(image_path: str) -> np.ndarray:
1253
- """'load_image' is a utility function that loads an image from the given file path string.
1262
+ """'load_image' is a utility function that loads an image from the given file path string or an URL.
1254
1263
 
1255
1264
  Parameters:
1256
- image_path (str): The path to the image.
1265
+ image_path (str): The path or URL to the image.
1257
1266
 
1258
1267
  Returns:
1259
1268
  np.ndarray: The image as a NumPy array.
@@ -1265,6 +1274,13 @@ def load_image(image_path: str) -> np.ndarray:
1265
1274
  # NOTE: sometimes the generated code pass in a NumPy array
1266
1275
  if isinstance(image_path, np.ndarray):
1267
1276
  return image_path
1277
+ if image_path.startswith(("http", "https")):
1278
+ _, image_suffix = os.path.splitext(image_path)
1279
+ with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
1280
+ # Download the image and save it to the temporary file
1281
+ with urllib.request.urlopen(image_path) as response:
1282
+ tmp_file.write(response.read())
1283
+ image_path = tmp_file.name
1268
1284
  image = Image.open(image_path).convert("RGB")
1269
1285
  return np.array(image)
1270
1286
 
@@ -1418,6 +1434,7 @@ def overlay_segmentation_masks(
1418
1434
  medias: Union[np.ndarray, List[np.ndarray]],
1419
1435
  masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
1420
1436
  draw_label: bool = True,
1437
+ secondary_label_key: str = "tracking_label",
1421
1438
  ) -> Union[np.ndarray, List[np.ndarray]]:
1422
1439
  """'overlay_segmentation_masks' is a utility function that displays segmentation
1423
1440
  masks.
@@ -1426,7 +1443,10 @@ def overlay_segmentation_masks(
1426
1443
  medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
1427
1444
  the masks on.
1428
1445
  masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
1429
- dictionaries containing the masks.
1446
+ dictionaries containing the masks, labels and scores.
1447
+ draw_label (bool, optional): If True, the labels will be displayed on the image.
1448
+ secondary_label_key (str, optional): The key to use for the secondary
1449
+ tracking label which is needed in videos to display tracking information.
1430
1450
 
1431
1451
  Returns:
1432
1452
  np.ndarray: The image with the masks displayed.
@@ -1471,6 +1491,7 @@ def overlay_segmentation_masks(
1471
1491
  for elt in masks_int[i]:
1472
1492
  mask = elt["mask"]
1473
1493
  label = elt["label"]
1494
+ tracking_lbl = elt.get(secondary_label_key, None)
1474
1495
  np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
1475
1496
  np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
1476
1497
  mask_img = Image.fromarray(np_mask.astype(np.uint8))
@@ -1478,16 +1499,17 @@ def overlay_segmentation_masks(
1478
1499
 
1479
1500
  if draw_label:
1480
1501
  draw = ImageDraw.Draw(pil_image)
1481
- text_box = draw.textbbox((0, 0), text=label, font=font)
1502
+ text = tracking_lbl if tracking_lbl else label
1503
+ text_box = draw.textbbox((0, 0), text=text, font=font)
1482
1504
  x, y = _get_text_coords_from_mask(
1483
1505
  mask,
1484
1506
  v_gap=(text_box[3] - text_box[1]) + 10,
1485
1507
  h_gap=(text_box[2] - text_box[0]) // 2,
1486
1508
  )
1487
1509
  if x != 0 and y != 0:
1488
- text_box = draw.textbbox((x, y), text=label, font=font)
1510
+ text_box = draw.textbbox((x, y), text=text, font=font)
1489
1511
  draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
1490
- draw.text((x, y), label, fill="black", font=font)
1512
+ draw.text((x, y), text, fill="black", font=font)
1491
1513
  frame_out.append(np.array(pil_image))
1492
1514
  return frame_out[0] if len(frame_out) == 1 else frame_out
1493
1515
 
@@ -1663,7 +1685,7 @@ FUNCTION_TOOLS = [
1663
1685
  florence2_ocr,
1664
1686
  florence2_sam2_image,
1665
1687
  florence2_sam2_video,
1666
- florence2_object_detection,
1688
+ florence2_phrase_grounding,
1667
1689
  ixc25_image_vqa,
1668
1690
  ixc25_video_vqa,
1669
1691
  detr_segmentation,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.116
3
+ Version: 0.2.118
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -252,7 +252,7 @@ function. Make sure the documentation is in the same format above with descripti
252
252
  `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
253
253
  [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
254
254
 
255
- ## Additional LLMs
255
+ ## Additional Backends
256
256
  ### Ollama
257
257
  We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
258
258
  a few models:
@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepov
3
3
  vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
4
4
  vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
5
5
  vision_agent/agent/vision_agent.py,sha256=5rgO-pScVOS3t4sWnLBnGYYkGftGgF4U0FpZzFVrDAY,8447
6
- vision_agent/agent/vision_agent_coder.py,sha256=qRSv_krY6-uHJC8exo3Nw0dPJ81jSzhKw2WTCHw1XVE,33733
6
+ vision_agent/agent/vision_agent_coder.py,sha256=tE-15ttnDxUsEdB0XJP4AVNyOU89KS8ZvXZDPcNKA-8,34380
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=xIya1txRZM8qoQHAWTEkEFCL8L3iZD7QD09t3ZtdxSE,11305
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=ydUU_Wvw-jqdL_vObSUr-VCQvjSwA5Fd74TbbhUzyxk,6112
9
9
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,13 +12,13 @@ vision_agent/clients/landing_public_api.py,sha256=6L15zh5lP5JHCpGnYpHMREgrrKiJin
12
12
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
14
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
15
- vision_agent/lmm/lmm.py,sha256=cuXtfFb7kJwVTyHTeK_t1bYItPiNjmDI2gF8vJs4gsM,20231
15
+ vision_agent/lmm/lmm.py,sha256=xkAxunToISzo5rCcjekqQBvm5SRW-98htieLuztKNbk,20802
16
16
  vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
17
- vision_agent/tools/__init__.py,sha256=Y6Y7McmdC8cm6UsJgExBLEPi4StBkqfY4y8_Mp7LlWU,2190
17
+ vision_agent/tools/__init__.py,sha256=lUUc2HV13eSxg5KPZop1D-mB4ecmiQ5fYlBTQLNSbYg,2190
18
18
  vision_agent/tools/meta_tools.py,sha256=q6h7hZarZrsWRloVE6PbTZwW8J2N1uUM9Ac-XxsT6hk,13365
19
19
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
20
  vision_agent/tools/tool_utils.py,sha256=qMsb9d8QtpXGgF9rpPO2dA390BewKdYO68oWKDu-TGg,6504
21
- vision_agent/tools/tools.py,sha256=JscejDn05jpYW6psPkRDesegPtZJshNWCncGFPOpI7c,58626
21
+ vision_agent/tools/tools.py,sha256=gAW6G9k1vzy8jwRACNnw2Vihsajm_oSlVJqd6E4JSRA,59957
22
22
  vision_agent/tools/tools_types.py,sha256=z6_XtUhWgh201yM7Z0CYtiLBEGdHPc_QUydMDHZ84EA,2216
23
23
  vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
24
24
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=c1LrmaHD331za8DbA1myJpgUmWoDzePaOK6-dsd
27
27
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
29
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
30
- vision_agent-0.2.116.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.116.dist-info/METADATA,sha256=iUNOaT5grsrdL_2yCiUqhaBvXoWtuFdxGSFlsJYF-nQ,11993
32
- vision_agent-0.2.116.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.116.dist-info/RECORD,,
30
+ vision_agent-0.2.118.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.118.dist-info/METADATA,sha256=4ilO7j9MOLCtaNekUUVlhMNdDKMk02ecx7ipnXT9RC8,11997
32
+ vision_agent-0.2.118.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.118.dist-info/RECORD,,