vision-agent 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -718,7 +718,12 @@ class VisionAgentCoder(Agent):
718
718
  for chat_i in chat:
719
719
  if "media" in chat_i:
720
720
  for media in chat_i["media"]:
721
- media = code_interpreter.upload_file(media)
721
+ media = (
722
+ media
723
+ if type(media) is str
724
+ and media.startswith(("http", "https"))
725
+ else code_interpreter.upload_file(media)
726
+ )
722
727
  chat_i["content"] += f" Media name {media}" # type: ignore
723
728
  media_list.append(media)
724
729
 
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
744
749
  results = {"code": "", "test": "", "plan": []}
745
750
  plan = []
746
751
  success = False
747
- self.log_progress(
748
- {
749
- "type": "log",
750
- "log_content": "Creating plans",
751
- "status": "started",
752
- }
753
- )
754
- plans = write_plans(
755
- int_chat,
756
- T.get_tool_descriptions_by_names(
757
- customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
758
- ),
759
- format_memory(working_memory),
760
- self.planner,
752
+
753
+ plans = self._create_plans(
754
+ int_chat, customized_tool_names, working_memory, self.planner
761
755
  )
762
756
 
763
- if self.verbosity >= 1:
764
- for p in plans:
765
- # tabulate will fail if the keys are not the same for all elements
766
- p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
767
- _LOGGER.info(
768
- f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
769
- )
757
+ if test_multi_plan:
758
+ self._log_plans(plans, self.verbosity)
759
+
770
760
  tool_infos = retrieve_tools(
771
761
  plans,
772
762
  self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
860
850
  if self.report_progress_callback is not None:
861
851
  self.report_progress_callback(data)
862
852
 
853
+ def _create_plans(
854
+ self,
855
+ int_chat: List[Message],
856
+ customized_tool_names: Optional[List[str]],
857
+ working_memory: List[Dict[str, str]],
858
+ planner: LMM,
859
+ ) -> Dict[str, Any]:
860
+ self.log_progress(
861
+ {
862
+ "type": "log",
863
+ "log_content": "Creating plans",
864
+ "status": "started",
865
+ }
866
+ )
867
+ plans = write_plans(
868
+ int_chat,
869
+ T.get_tool_descriptions_by_names(
870
+ customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
871
+ ),
872
+ format_memory(working_memory),
873
+ planner,
874
+ )
875
+ return plans
876
+
877
+ def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
878
+ if verbosity >= 1:
879
+ for p in plans:
880
+ # tabulate will fail if the keys are not the same for all elements
881
+ p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
882
+ _LOGGER.info(
883
+ f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
884
+ )
885
+
863
886
 
864
887
  class OllamaVisionAgentCoder(VisionAgentCoder):
865
888
  """VisionAgentCoder that uses Ollama models for planning, coding, testing.
vision_agent/lmm/lmm.py CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
30
30
 
31
31
 
32
32
  def encode_media(media: Union[str, Path]) -> str:
33
+ if type(media) is str and media.startswith(("http", "https")):
34
+ # for an mp4 video URL, we assume the same URL also exists with a .png extension
35
+ # vision-agent-ui will upload this png when uploading the video
36
+ if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
37
+ return media[:-4] + ".png"
38
+ return media
33
39
  extension = "png"
34
40
  extension = Path(media).suffix
35
41
  if extension.lower() not in {
@@ -138,7 +144,11 @@ class OpenAILMM(LMM):
138
144
  {
139
145
  "type": "image_url",
140
146
  "image_url": {
141
- "url": f"data:image/png;base64,{encoded_media}",
147
+ "url": (
148
+ encoded_media
149
+ if encoded_media.startswith(("http", "https"))
150
+ else f"data:image/png;base64,{encoded_media}"
151
+ ),
142
152
  "detail": "low",
143
153
  },
144
154
  },
@@ -390,7 +400,6 @@ class OllamaLMM(LMM):
390
400
  tmp_kwargs = self.kwargs | kwargs
391
401
  data.update(tmp_kwargs)
392
402
  if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
393
-
394
403
  json_data = json.dumps(data)
395
404
 
396
405
  def f() -> Iterator[Optional[str]]:
@@ -424,7 +433,6 @@ class OllamaLMM(LMM):
424
433
  media: Optional[List[Union[str, Path]]] = None,
425
434
  **kwargs: Any,
426
435
  ) -> Union[str, Iterator[Optional[str]]]:
427
-
428
436
  url = f"{self.url}/generate"
429
437
  data: Dict[str, Any] = {
430
438
  "model": self.model_name,
@@ -439,7 +447,6 @@ class OllamaLMM(LMM):
439
447
  tmp_kwargs = self.kwargs | kwargs
440
448
  data.update(tmp_kwargs)
441
449
  if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
442
-
443
450
  json_data = json.dumps(data)
444
451
 
445
452
  def f() -> Iterator[Optional[str]]:
@@ -21,7 +21,7 @@ from .tools import (
21
21
  dpt_hybrid_midas,
22
22
  extract_frames,
23
23
  florence2_image_caption,
24
- florence2_object_detection,
24
+ florence2_phrase_grounding,
25
25
  florence2_ocr,
26
26
  florence2_roberta_vqa,
27
27
  florence2_sam2_image,
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import io
2
3
  import json
3
4
  import logging
@@ -14,6 +15,7 @@ from moviepy.editor import ImageSequenceClip
14
15
  from PIL import Image, ImageDraw, ImageFont
15
16
  from pillow_heif import register_heif_opener # type: ignore
16
17
  from pytube import YouTube # type: ignore
18
+ import urllib.request
17
19
 
18
20
  from vision_agent.clients.landing_public_api import LandingPublicAPI
19
21
  from vision_agent.tools.tool_utils import (
@@ -760,10 +762,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
760
762
  return answer[task] # type: ignore
761
763
 
762
764
 
763
- def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
764
- """'florencev2_object_detection' is a tool that can detect and count multiple
765
- objects given a text prompt such as category names or referring expressions. You
766
- can optionally separate the categories in the text with commas. It returns a list
765
+ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
766
+ """'florence2_phrase_grounding' is a tool that can detect multiple
767
+ objects given a text prompt, which can be object names or a caption. You
768
+ can optionally separate the object names in the text with commas. It returns a list
767
769
  of bounding boxes with normalized coordinates, label names and associated
768
770
  probability scores of 1.0.
769
771
 
@@ -780,7 +782,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
780
782
 
781
783
  Example
782
784
  -------
783
- >>> florence2_object_detection('person looking at a coyote', image)
785
+ >>> florence2_phrase_grounding('person looking at a coyote', image)
784
786
  [
785
787
  {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
786
788
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
@@ -792,7 +794,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
792
794
  "image": image_b64,
793
795
  "task": "<CAPTION_TO_PHRASE_GROUNDING>",
794
796
  "prompt": prompt,
795
- "function_name": "florence2_object_detection",
797
+ "function_name": "florence2_phrase_grounding",
796
798
  }
797
799
 
798
800
  detections = send_inference_request(data, "florence2", v2=True)
@@ -1220,6 +1222,13 @@ def extract_frames(
1220
1222
  video_file_path = video.download(output_path=temp_dir)
1221
1223
 
1222
1224
  return extract_frames_from_video(video_file_path, fps)
1225
+ elif str(video_uri).startswith(("http", "https")):
1226
+ _, image_suffix = os.path.splitext(video_uri)
1227
+ with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
1228
+ # Download the video and save it to the temporary file
1229
+ with urllib.request.urlopen(str(video_uri)) as response:
1230
+ tmp_file.write(response.read())
1231
+ return extract_frames_from_video(tmp_file.name, fps)
1223
1232
 
1224
1233
  return extract_frames_from_video(str(video_uri), fps)
1225
1234
 
@@ -1250,10 +1259,10 @@ def save_json(data: Any, file_path: str) -> None:
1250
1259
 
1251
1260
 
1252
1261
  def load_image(image_path: str) -> np.ndarray:
1253
- """'load_image' is a utility function that loads an image from the given file path string.
1262
+ """'load_image' is a utility function that loads an image from the given file path string or a URL.
1254
1263
 
1255
1264
  Parameters:
1256
- image_path (str): The path to the image.
1265
+ image_path (str): The path or URL to the image.
1257
1266
 
1258
1267
  Returns:
1259
1268
  np.ndarray: The image as a NumPy array.
@@ -1265,6 +1274,13 @@ def load_image(image_path: str) -> np.ndarray:
1265
1274
  # NOTE: sometimes the generated code pass in a NumPy array
1266
1275
  if isinstance(image_path, np.ndarray):
1267
1276
  return image_path
1277
+ if image_path.startswith(("http", "https")):
1278
+ _, image_suffix = os.path.splitext(image_path)
1279
+ with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
1280
+ # Download the image and save it to the temporary file
1281
+ with urllib.request.urlopen(image_path) as response:
1282
+ tmp_file.write(response.read())
1283
+ image_path = tmp_file.name
1268
1284
  image = Image.open(image_path).convert("RGB")
1269
1285
  return np.array(image)
1270
1286
 
@@ -1418,6 +1434,7 @@ def overlay_segmentation_masks(
1418
1434
  medias: Union[np.ndarray, List[np.ndarray]],
1419
1435
  masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
1420
1436
  draw_label: bool = True,
1437
+ secondary_label_key: str = "tracking_label",
1421
1438
  ) -> Union[np.ndarray, List[np.ndarray]]:
1422
1439
  """'overlay_segmentation_masks' is a utility function that displays segmentation
1423
1440
  masks.
@@ -1426,7 +1443,10 @@ def overlay_segmentation_masks(
1426
1443
  medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
1427
1444
  the masks on.
1428
1445
  masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
1429
- dictionaries containing the masks.
1446
+ dictionaries containing the masks, labels and scores.
1447
+ draw_label (bool, optional): If True, the labels will be displayed on the image.
1448
+ secondary_label_key (str, optional): The key to use for the secondary
1449
+ tracking label which is needed in videos to display tracking information.
1430
1450
 
1431
1451
  Returns:
1432
1452
  np.ndarray: The image with the masks displayed.
@@ -1471,6 +1491,7 @@ def overlay_segmentation_masks(
1471
1491
  for elt in masks_int[i]:
1472
1492
  mask = elt["mask"]
1473
1493
  label = elt["label"]
1494
+ tracking_lbl = elt.get(secondary_label_key, None)
1474
1495
  np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
1475
1496
  np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
1476
1497
  mask_img = Image.fromarray(np_mask.astype(np.uint8))
@@ -1478,16 +1499,17 @@ def overlay_segmentation_masks(
1478
1499
 
1479
1500
  if draw_label:
1480
1501
  draw = ImageDraw.Draw(pil_image)
1481
- text_box = draw.textbbox((0, 0), text=label, font=font)
1502
+ text = tracking_lbl if tracking_lbl else label
1503
+ text_box = draw.textbbox((0, 0), text=text, font=font)
1482
1504
  x, y = _get_text_coords_from_mask(
1483
1505
  mask,
1484
1506
  v_gap=(text_box[3] - text_box[1]) + 10,
1485
1507
  h_gap=(text_box[2] - text_box[0]) // 2,
1486
1508
  )
1487
1509
  if x != 0 and y != 0:
1488
- text_box = draw.textbbox((x, y), text=label, font=font)
1510
+ text_box = draw.textbbox((x, y), text=text, font=font)
1489
1511
  draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
1490
- draw.text((x, y), label, fill="black", font=font)
1512
+ draw.text((x, y), text, fill="black", font=font)
1491
1513
  frame_out.append(np.array(pil_image))
1492
1514
  return frame_out[0] if len(frame_out) == 1 else frame_out
1493
1515
 
@@ -1663,7 +1685,7 @@ FUNCTION_TOOLS = [
1663
1685
  florence2_ocr,
1664
1686
  florence2_sam2_image,
1665
1687
  florence2_sam2_video,
1666
- florence2_object_detection,
1688
+ florence2_phrase_grounding,
1667
1689
  ixc25_image_vqa,
1668
1690
  ixc25_video_vqa,
1669
1691
  detr_segmentation,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.116
3
+ Version: 0.2.118
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -252,7 +252,7 @@ function. Make sure the documentation is in the same format above with descripti
252
252
  `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
253
253
  [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
254
254
 
255
- ## Additional LLMs
255
+ ## Additional Backends
256
256
  ### Ollama
257
257
  We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
258
258
  a few models:
@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepov
3
3
  vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
4
4
  vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
5
5
  vision_agent/agent/vision_agent.py,sha256=5rgO-pScVOS3t4sWnLBnGYYkGftGgF4U0FpZzFVrDAY,8447
6
- vision_agent/agent/vision_agent_coder.py,sha256=qRSv_krY6-uHJC8exo3Nw0dPJ81jSzhKw2WTCHw1XVE,33733
6
+ vision_agent/agent/vision_agent_coder.py,sha256=tE-15ttnDxUsEdB0XJP4AVNyOU89KS8ZvXZDPcNKA-8,34380
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=xIya1txRZM8qoQHAWTEkEFCL8L3iZD7QD09t3ZtdxSE,11305
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=ydUU_Wvw-jqdL_vObSUr-VCQvjSwA5Fd74TbbhUzyxk,6112
9
9
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,13 +12,13 @@ vision_agent/clients/landing_public_api.py,sha256=6L15zh5lP5JHCpGnYpHMREgrrKiJin
12
12
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
14
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
15
- vision_agent/lmm/lmm.py,sha256=cuXtfFb7kJwVTyHTeK_t1bYItPiNjmDI2gF8vJs4gsM,20231
15
+ vision_agent/lmm/lmm.py,sha256=xkAxunToISzo5rCcjekqQBvm5SRW-98htieLuztKNbk,20802
16
16
  vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
17
- vision_agent/tools/__init__.py,sha256=Y6Y7McmdC8cm6UsJgExBLEPi4StBkqfY4y8_Mp7LlWU,2190
17
+ vision_agent/tools/__init__.py,sha256=lUUc2HV13eSxg5KPZop1D-mB4ecmiQ5fYlBTQLNSbYg,2190
18
18
  vision_agent/tools/meta_tools.py,sha256=q6h7hZarZrsWRloVE6PbTZwW8J2N1uUM9Ac-XxsT6hk,13365
19
19
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
20
  vision_agent/tools/tool_utils.py,sha256=qMsb9d8QtpXGgF9rpPO2dA390BewKdYO68oWKDu-TGg,6504
21
- vision_agent/tools/tools.py,sha256=JscejDn05jpYW6psPkRDesegPtZJshNWCncGFPOpI7c,58626
21
+ vision_agent/tools/tools.py,sha256=gAW6G9k1vzy8jwRACNnw2Vihsajm_oSlVJqd6E4JSRA,59957
22
22
  vision_agent/tools/tools_types.py,sha256=z6_XtUhWgh201yM7Z0CYtiLBEGdHPc_QUydMDHZ84EA,2216
23
23
  vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
24
24
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=c1LrmaHD331za8DbA1myJpgUmWoDzePaOK6-dsd
27
27
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
29
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
30
- vision_agent-0.2.116.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.116.dist-info/METADATA,sha256=iUNOaT5grsrdL_2yCiUqhaBvXoWtuFdxGSFlsJYF-nQ,11993
32
- vision_agent-0.2.116.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.116.dist-info/RECORD,,
30
+ vision_agent-0.2.118.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.118.dist-info/METADATA,sha256=4ilO7j9MOLCtaNekUUVlhMNdDKMk02ecx7ipnXT9RC8,11997
32
+ vision_agent-0.2.118.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.118.dist-info/RECORD,,