vision-agent 0.2.5__tar.gz → 0.2.7__tar.gz

This diff compares the contents of two publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (25)
  1. {vision_agent-0.2.5 → vision_agent-0.2.7}/PKG-INFO +1 -1
  2. {vision_agent-0.2.5 → vision_agent-0.2.7}/pyproject.toml +1 -1
  3. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/vision_agent.py +26 -10
  4. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/image_utils.py +5 -7
  5. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/tools.py +7 -2
  6. {vision_agent-0.2.5 → vision_agent-0.2.7}/LICENSE +0 -0
  7. {vision_agent-0.2.5 → vision_agent-0.2.7}/README.md +0 -0
  8. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/__init__.py +0 -0
  9. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/__init__.py +0 -0
  10. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/agent.py +0 -0
  11. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/easytool.py +0 -0
  12. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/easytool_prompts.py +0 -0
  13. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/reflexion.py +0 -0
  14. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/reflexion_prompts.py +0 -0
  15. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/vision_agent_prompts.py +0 -0
  16. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/fonts/__init__.py +0 -0
  17. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  18. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/llm/__init__.py +0 -0
  19. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/llm/llm.py +0 -0
  20. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/lmm/__init__.py +0 -0
  21. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/lmm/lmm.py +0 -0
  22. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/__init__.py +0 -0
  23. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/prompts.py +0 -0
  24. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/video.py +0 -0
  25. {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/type_defs.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.5"
7
+ version = "0.2.7"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -314,6 +314,7 @@ def _handle_extract_frames(
314
314
  image_to_data[image] = {
315
315
  "bboxes": [],
316
316
  "masks": [],
317
+ "heat_map": [],
317
318
  "labels": [],
318
319
  "scores": [],
319
320
  }
@@ -340,9 +341,12 @@ def _handle_viz_tools(
340
341
  return image_to_data
341
342
 
342
343
  for param, call_result in zip(parameters, tool_result["call_results"]):
343
- # calls can fail, so we need to check if the call was successful
344
+ # Calls can fail, so we need to check if the call was successful. It can either:
345
+ # 1. return a str or some error that's not a dictionary
346
+ # 2. return a dictionary but not have the necessary keys
347
+
344
348
  if not isinstance(call_result, dict) or (
345
- "bboxes" not in call_result and "masks" not in call_result
349
+ "bboxes" not in call_result and "heat_map" not in call_result
346
350
  ):
347
351
  return image_to_data
348
352
 
@@ -352,6 +356,7 @@ def _handle_viz_tools(
352
356
  image_to_data[image] = {
353
357
  "bboxes": [],
354
358
  "masks": [],
359
+ "heat_map": [],
355
360
  "labels": [],
356
361
  "scores": [],
357
362
  }
@@ -360,6 +365,8 @@ def _handle_viz_tools(
360
365
  image_to_data[image]["labels"].extend(call_result.get("labels", []))
361
366
  image_to_data[image]["scores"].extend(call_result.get("scores", []))
362
367
  image_to_data[image]["masks"].extend(call_result.get("masks", []))
368
+ # only single heatmap is returned
369
+ image_to_data[image]["heat_map"].append(call_result.get("heat_map", []))
363
370
  if "mask_shape" in call_result:
364
371
  image_to_data[image]["mask_shape"] = call_result["mask_shape"]
365
372
 
@@ -480,9 +487,14 @@ class VisionAgent(Agent):
480
487
  """Invoke the vision agent.
481
488
 
482
489
  Parameters:
483
- input: a prompt that describe the task or a conversation in the format of
490
+ chat: A conversation in the format of
484
491
  [{"role": "user", "content": "describe your task here..."}].
485
- image: the input image referenced in the prompt parameter.
492
+ image: The input image referenced in the chat parameter.
493
+ reference_data: A dictionary containing the reference image, mask or bounding
494
+ box in the format of:
495
+ {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
496
+ where the bounding box coordinates are normalized.
497
+ visualize_output: Whether to visualize the output.
486
498
 
487
499
  Returns:
488
500
  The result of the vision agent in text.
@@ -509,7 +521,9 @@ class VisionAgent(Agent):
509
521
  self.report_progress_callback("<VIZ>")
510
522
  if images:
511
523
  for img in images:
512
- self.report_progress_callback(f"<IMG>{convert_to_b64(img)}</IMG>")
524
+ self.report_progress_callback(
525
+ f"<IMG>base:64{convert_to_b64(img)}</IMG>"
526
+ )
513
527
  self.report_progress_callback("</VIZ>")
514
528
 
515
529
  def chat_with_workflow(
@@ -522,12 +536,14 @@ class VisionAgent(Agent):
522
536
  """Chat with the vision agent and return the final answer and all tool results.
523
537
 
524
538
  Parameters:
525
- chat: a conversation in the format of
539
+ chat: A conversation in the format of
526
540
  [{"role": "user", "content": "describe your task here..."}].
527
- image: the input image referenced in the chat parameter.
528
- reference_data: a dictionary containing the reference image and mask. in the
529
- format of {"image": "image.jpg", "mask": "mask.jpg}
530
- visualize_output: whether to visualize the output.
541
+ image: The input image referenced in the chat parameter.
542
+ reference_data: A dictionary containing the reference image, mask or bounding
543
+ box in the format of:
544
+ {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
545
+ where the bounding box coordinates are normalized.
546
+ visualize_output: Whether to visualize the output.
531
547
 
532
548
  Returns:
533
549
  A tuple where the first item is the final answer and the second item is a
@@ -211,7 +211,7 @@ def overlay_masks(
211
211
  }
212
212
 
213
213
  for label, mask in zip(masks["labels"], masks["masks"]):
214
- if isinstance(mask, str):
214
+ if isinstance(mask, str) or isinstance(mask, Path):
215
215
  mask = np.array(Image.open(mask))
216
216
  np_mask = np.zeros((image.size[1], image.size[0], 4))
217
217
  np_mask[mask > 0, :] = color[label] + (255 * alpha,)
@@ -221,7 +221,7 @@ def overlay_masks(
221
221
 
222
222
 
223
223
  def overlay_heat_map(
224
- image: Union[str, Path, np.ndarray, ImageType], masks: Dict, alpha: float = 0.8
224
+ image: Union[str, Path, np.ndarray, ImageType], heat_map: Dict, alpha: float = 0.8
225
225
  ) -> ImageType:
226
226
  r"""Plots heat map on to an image.
227
227
 
@@ -238,14 +238,12 @@ def overlay_heat_map(
238
238
  elif isinstance(image, np.ndarray):
239
239
  image = Image.fromarray(image)
240
240
 
241
- if "masks" not in masks:
241
+ if "heat_map" not in heat_map:
242
242
  return image.convert("RGB")
243
243
 
244
- # Only one heat map per image, so no need to loop through masks
245
244
  image = image.convert("L")
246
-
247
- if isinstance(masks["masks"][0], str):
248
- mask = b64_to_pil(masks["masks"][0])
245
+ # Only one heat map per image, so no need to loop through masks
246
+ mask = Image.fromarray(heat_map["heat_map"][0])
249
247
 
250
248
  overlay = Image.new("RGBA", mask.size)
251
249
  odraw = ImageDraw.Draw(overlay)
@@ -11,6 +11,7 @@ from PIL import Image
11
11
  from PIL.Image import Image as ImageType
12
12
 
13
13
  from vision_agent.image_utils import (
14
+ b64_to_pil,
14
15
  convert_to_b64,
15
16
  denormalize_bbox,
16
17
  get_image_size,
@@ -516,7 +517,9 @@ class ZeroShotCounting(Tool):
516
517
  "image": image_b64,
517
518
  "tool": "zero_shot_counting",
518
519
  }
519
- return _send_inference_request(data, "tools")
520
+ resp_data = _send_inference_request(data, "tools")
521
+ resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
522
+ return resp_data
520
523
 
521
524
 
522
525
  class VisualPromptCounting(Tool):
@@ -585,7 +588,9 @@ class VisualPromptCounting(Tool):
585
588
  "prompt": prompt,
586
589
  "tool": "few_shot_counting",
587
590
  }
588
- return _send_inference_request(data, "tools")
591
+ resp_data = _send_inference_request(data, "tools")
592
+ resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
593
+ return resp_data
589
594
 
590
595
 
591
596
  class VisualQuestionAnswering(Tool):
File without changes
File without changes