vision-agent 1.1.13__py3-none-any.whl → 1.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -406,6 +406,29 @@ desc,doc,name
406
406
  [
407
407
  {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
408
408
  ]",ocr
409
+ "'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: Optional[numpy.ndarray] = None) -> numpy.ndarray:
410
+ 'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
411
+ It can be used to edit parts of an image or the entire image according to the prompt given.
412
+
413
+ Parameters:
414
+ prompt (str): A detailed text description guiding what should be generated
415
+ in the image. More detailed and specific prompts typically yield
416
+ better results.
417
+ image (np.ndarray, optional): The source image to be inpainted. The image will serve as
418
+ the base context for the inpainting process.
419
+
420
+ Returns:
421
+ np.ndarray: The generated image(s) as a numpy array in RGB format with values
422
+ ranging from 0 to 255.
423
+
424
+ -------
425
+ Example:
426
+ >>> # Generate inpainting
427
+ >>> result = gemini_image_generation(
428
+ ... prompt=""a modern black leather sofa with white pillows"",
429
+ ... image=image,
430
+ ... )
431
+ >>> save_image(result, ""inpainted_room.png"")",gemini_image_generation
409
432
  'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
410
433
  'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
411
434
  images including regular images or images of documents or presentations. It can be
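A minimal usage sketch of the broadened gemini_image_generation signature added above, where image is now Optional so the tool also covers pure text-to-image generation. It assumes gemini_image_generation and save_image are importable from vision_agent.tools, as elsewhere in this release:

import numpy as np
from vision_agent.tools import gemini_image_generation, save_image

# Text-to-image generation: only a prompt is supplied, no source image.
generated = gemini_image_generation(prompt="a modern black leather sofa with white pillows")
save_image(generated, "generated_sofa.png")

# Prompt-guided editing of an existing image (the former inpainting use case).
# `room` stands in for an RGB numpy array loaded elsewhere.
room = np.zeros((512, 512, 3), dtype=np.uint8)
edited = gemini_image_generation(prompt="add white pillows to the sofa", image=room)
save_image(edited, "edited_room.png")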
@@ -439,27 +462,28 @@ desc,doc,name
439
462
  -------
440
463
  >>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
441
464
  'Lionel Messi'",qwen25_vl_video_vqa
442
- 'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen25vl', chunk_length_frames: int = 10) -> List[float]:
443
- 'activity_recognition' is a tool that can recognize activities in a video given a
444
- text prompt. It can be used to identify where specific activities or actions
445
- happen in a video and returns a list of 0s and 1s to indicate the activity.
465
+ "'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video. It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.","agentic_activity_recognition(prompt: str, frames: List[numpy.ndarray], fps: Optional[float] = 5, specificity: str = 'max', with_audio: bool = False) -> List[Dict[str, Any]]:
466
+ 'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video.
467
+ It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.
446
468
 
447
469
  Parameters:
448
- prompt (str): The event you want to identify, should be phrased as a question,
449
- for example, ""Did a goal happen?"".
450
- frames (List[np.ndarray]): The reference frames used for the question
451
- model (str): The model to use for the inference. Valid values are
452
- 'claude-35', 'gpt-4o', 'qwen2vl'.
453
- chunk_length_frames (int): length of each chunk in frames
470
+ prompt (str): The prompt for activity recognition. Multiple activities can be separated by semicolons.
471
+ frames (List[np.ndarray]): The list of frames corresponding to the video.
472
+ fps (float, optional): The frame rate per second to extract the frames at. Defaults to 5.
473
+ specificity (str, optional): Specificity or precision level for activity recognition - low, medium, high, max. Default is max.
474
+ with_audio (bool, optional): Whether to include audio processing in activity recognition. Set it to false if there is no audio in the video. Default is false.
454
475
 
455
476
  Returns:
456
- List[float]: A list of floats with a value of 1.0 if the activity is detected in
457
- the chunk_length_frames of the video.
477
+ List[Dict[str, Any]]: A list of dictionaries containing the start time, end time, location, description, and label for each detected activity.
478
+ The start and end times are in seconds, the location is a string, the description is a string, and the label is an integer.
458
479
 
459
480
  Example
460
481
  -------
461
- >>> activity_recognition('Did a goal happened?', frames)
462
- [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",activity_recognition
482
+ >>> agentic_activity_recognition('Person gets on bike; Person gets off bike', frames)
483
+ [
484
+ {'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
485
+ {'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
486
+ ]",agentic_activity_recognition
463
487
  'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
464
488
  'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
465
489
  depth image from a given RGB image. The returned depth image is monochrome and
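A short sketch of consuming the event dictionaries documented above for agentic_activity_recognition, assuming frames is a List[np.ndarray] of video frames obtained elsewhere and the tool is imported from vision_agent.tools:

from typing import Any, Dict, List

import numpy as np
from vision_agent.tools import agentic_activity_recognition

frames: List[np.ndarray] = [np.zeros((480, 640, 3), dtype=np.uint8)]  # placeholder frames

events: List[Dict[str, Any]] = agentic_activity_recognition(
    "Person gets on bike; Person gets off bike",
    frames,
    fps=5,
    specificity="max",
    with_audio=False,
)
for event in events:
    # `label` indexes the semicolon-separated activities given in the prompt.
    print(f"{event['start_time']}s-{event['end_time']}s "
          f"(activity {event['label']}): {event['description']}")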
@@ -514,59 +538,6 @@ desc,doc,name
514
538
  -------
515
539
  >>> vit_nsfw_classification(image)
516
540
  {""label"": ""normal"", ""scores"": 0.68},",vit_nsfw_classification
517
- "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
518
- 'flux_image_inpainting' performs image inpainting to fill the masked regions,
519
- given by mask, in the image, given image based on the text prompt and surrounding
520
- image context. It can be used to edit regions of an image according to the prompt
521
- given.
522
-
523
- Parameters:
524
- prompt (str): A detailed text description guiding what should be generated
525
- in the masked area. More detailed and specific prompts typically yield
526
- better results.
527
- image (np.ndarray): The source image to be inpainted. The image will serve as
528
- the base context for the inpainting process.
529
- mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
530
- areas to be inpainted and 0 indicates areas to be preserved.
531
-
532
- Returns:
533
- np.ndarray: The generated image(s) as a numpy array in RGB format with values
534
- ranging from 0 to 255.
535
-
536
- -------
537
- Example:
538
- >>> # Generate inpainting
539
- >>> result = flux_image_inpainting(
540
- ... prompt=""a modern black leather sofa with white pillows"",
541
- ... image=image,
542
- ... mask=mask,
543
- ... )
544
- >>> save_image(result, ""inpainted_room.png"")
545
- ",flux_image_inpainting
546
- "'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
547
- 'gemini_image_generation' performs image inpainting given an image and text prompt.
548
- It can be used to edit parts of an image or the entire image according to the prompt given.
549
-
550
- Parameters:
551
- prompt (str): A detailed text description guiding what should be generated
552
- in the image. More detailed and specific prompts typically yield
553
- better results.
554
- image (np.ndarray): The source image to be inpainted. The image will serve as
555
- the base context for the inpainting process.
556
-
557
- Returns:
558
- np.ndarray: The generated image(s) as a numpy array in RGB format with values
559
- ranging from 0 to 255.
560
-
561
- -------
562
- Example:
563
- >>> # Generate inpainting
564
- >>> result = gemini_image_generation(
565
- ... prompt="a modern black leather sofa with white pillows",
566
- ... image=image,
567
- ... )
568
- >>> save_image(result, ""inpainted_room.png"")
569
- ",gemini_image_generation
570
541
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
571
542
  'siglip_classification' is a tool that can classify an image or a cropped detection given a list
572
543
  of input labels or tags. It returns the same list of the input labels along with
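Because flux_image_inpainting (removed above) took an explicit binary mask while the retained gemini_image_generation edits from the prompt alone, a hypothetical migration looks like the sketch below; the region the mask used to select now has to be described in the prompt text:

import numpy as np
from vision_agent.tools import gemini_image_generation

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder for the source image

# 1.1.13 and earlier:
#   result = flux_image_inpainting(
#       prompt="a modern black leather sofa with white pillows", image=image, mask=mask
#   )
# 1.1.15: no mask argument; describe the area to change in the prompt instead.
result = gemini_image_generation(
    prompt="replace the sofa in the center of the room with a modern black leather sofa with white pillows",
    image=image,
)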
@@ -718,4 +689,4 @@ desc,doc,name
718
689
  [0, 0, 0, ..., 0, 0, 0],
719
690
  [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
720
691
  }],
721
- )",overlay_segmentation_masks
692
+ )",overlay_segmentation_masks
Binary file
@@ -519,7 +519,7 @@ You are given a task: "{task}" from the user. You must extract the type of categ
519
519
  - "video object tracking" - tracking objects in a video.
520
520
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.
521
521
  - "activity recognition" - identifying time period(s) an event occurs in a video.
522
- - "inpainting" - filling in masked parts of an image.
522
+ - "image generation" - generating images from a text prompt.
523
523
 
524
524
  Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
525
525
  """
@@ -55,7 +55,7 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
55
55
  - Pose estimation
56
56
  - Visual question answering for both images and videos
57
57
  - Activity recognition in videos
58
- - Image inpainting
58
+ - Image generation
59
59
 
60
60
  How can I help you?</response>
61
61
  --- END EXAMPLE2 ---
@@ -7,7 +7,7 @@ from .meta_tools import (
7
7
  from .planner_tools import judge_od_results
8
8
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
9
9
  from .tools import (
10
- activity_recognition,
10
+ agentic_activity_recognition,
11
11
  agentic_document_extraction,
12
12
  agentic_object_detection,
13
13
  agentic_sam2_instance_segmentation,
@@ -30,7 +30,6 @@ from .tools import (
30
30
  florence2_ocr,
31
31
  florence2_sam2_instance_segmentation,
32
32
  florence2_sam2_video_tracking,
33
- flux_image_inpainting,
34
33
  gemini_image_generation,
35
34
  generate_pose_image,
36
35
  get_tools,
@@ -24,7 +24,7 @@ import pymupdf # type: ignore
24
24
  from google import genai # type: ignore
25
25
  from google.genai import types # type: ignore
26
26
 
27
- from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
27
+ from vision_agent.lmm.lmm import AnthropicLMM
28
28
  from vision_agent.utils.execute import FileSerializer, MimeType
29
29
  from vision_agent.utils.image_utils import (
30
30
  b64_to_pil,
@@ -2337,140 +2337,55 @@ Answer the question directly using only the information from the document, do no
2337
2337
  return llm_output
2338
2338
 
2339
2339
 
2340
- def _sample(frames: List[np.ndarray], sample_size: int) -> List[np.ndarray]:
2341
- sample_indices = np.linspace(0, len(frames) - 1, sample_size, dtype=int)
2342
- sampled_frames = []
2343
-
2344
- for i, frame in enumerate(frames):
2345
- if i in sample_indices:
2346
- sampled_frames.append(frame)
2347
- if len(sampled_frames) >= sample_size:
2348
- break
2349
- return sampled_frames
2350
-
2351
-
2352
- def _lmm_activity_recognition(
2353
- lmm: LMM,
2354
- segment: List[np.ndarray],
2355
- prompt: str,
2356
- ) -> List[float]:
2357
- frames = _sample(segment, 10)
2358
- media = []
2359
- for frame in frames:
2360
- buffer = io.BytesIO()
2361
- image_pil = Image.fromarray(frame)
2362
- if image_pil.size[0] > 768:
2363
- image_pil.thumbnail((768, 768))
2364
- image_pil.save(buffer, format="PNG")
2365
- image_bytes = buffer.getvalue()
2366
- image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
2367
- media.append(image_b64)
2368
-
2369
- response = cast(str, lmm.generate(prompt, media))
2370
- if "yes" in response.lower():
2371
- return [1.0] * len(segment)
2372
- return [0.0] * len(segment)
2373
-
2374
-
2375
- def _qwenvl_activity_recognition(
2376
- segment: List[np.ndarray], prompt: str, model_name: str = "qwen2vl"
2377
- ) -> List[float]:
2378
- payload: Dict[str, Any] = {
2379
- "prompt": prompt,
2380
- "model": model_name,
2381
- "function_name": f"{model_name}_vl_video_vqa",
2382
- }
2383
- segment_buffer_bytes = [("video", frames_to_bytes(segment))]
2384
- response = send_inference_request(
2385
- payload, "image-to-text", files=segment_buffer_bytes, v2=True
2386
- )
2387
- if "yes" in response.lower():
2388
- return [1.0] * len(segment)
2389
- return [0.0] * len(segment)
2390
-
2391
-
2392
- def activity_recognition(
2340
+ def agentic_activity_recognition(
2393
2341
  prompt: str,
2394
2342
  frames: List[np.ndarray],
2395
- model: str = "qwen25vl",
2396
- chunk_length_frames: int = 10,
2397
- ) -> List[float]:
2398
- """'activity_recognition' is a tool that can recognize activities in a video given a
2399
- text prompt. It can be used to identify where specific activities or actions
2400
- happen in a video and returns a list of 0s and 1s to indicate the activity.
2343
+ fps: Optional[float] = 5,
2344
+ specificity: str = "max",
2345
+ with_audio: bool = False,
2346
+ ) -> List[Dict[str, Any]]:
2347
+ """'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video.
2348
+ It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.
2401
2349
 
2402
2350
  Parameters:
2403
- prompt (str): The event you want to identify, should be phrased as a question,
2404
- for example, "Did a goal happen?".
2405
- frames (List[np.ndarray]): The reference frames used for the question
2406
- model (str): The model to use for the inference. Valid values are
2407
- 'claude-35', 'gpt-4o', 'qwen2vl'.
2408
- chunk_length_frames (int): length of each chunk in frames
2351
+ prompt (str): The prompt for activity recognition. Multiple activities can be separated by semicolons.
2352
+ frames (List[np.ndarray]): The list of frames corresponding to the video.
2353
+ fps (float, optional): The frame rate per second to extract the frames at. Defaults to 5.
2354
+ specificity (str, optional): Specificity or precision level for activity recognition - low, medium, high, max. Default is max.
2355
+ with_audio (bool, optional): Whether to include audio processing in activity recognition. Set it to false if there is no audio in the video. Default is false.
2409
2356
 
2410
2357
  Returns:
2411
- List[float]: A list of floats with a value of 1.0 if the activity is detected in
2412
- the chunk_length_frames of the video.
2358
+ List[Dict[str, Any]]: A list of dictionaries containing the start time, end time, location, description, and label for each detected activity.
2359
+ The start and end times are in seconds, the location is a string, the description is a string, and the label is an integer.
2413
2360
 
2414
2361
  Example
2415
2362
  -------
2416
- >>> activity_recognition('Did a goal happened?', frames)
2417
- [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
2363
+ >>> agentic_activity_recognition('Person gets on bike; Person gets off bike', frames)
2364
+ [
2365
+ {'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
2366
+ {'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
2367
+ ]
2418
2368
  """
2419
-
2420
- buffer_bytes = frames_to_bytes(frames)
2369
+ fps = fps if fps is not None else 5
2370
+ buffer_bytes = frames_to_bytes(frames, fps=fps)
2421
2371
  files = [("video", buffer_bytes)]
2422
2372
 
2423
- segments = split_frames_into_segments(
2424
- frames, segment_size=chunk_length_frames, overlap=0
2425
- )
2373
+ payload = {"prompt": prompt, "specificity": specificity, "with_audio": with_audio}
2426
2374
 
2427
- prompt = (
2428
- f"{prompt} Please respond with a 'yes' or 'no' based on the frames provided."
2375
+ response = send_inference_request(
2376
+ payload=payload, endpoint_name="activity-recognition", files=files, v2=True
2429
2377
  )
2430
2378
 
2431
- if model == "claude-35":
2432
-
2433
- def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
2434
- return _lmm_activity_recognition(AnthropicLMM(), segment, prompt)
2435
-
2436
- elif model == "gpt-4o":
2437
-
2438
- def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
2439
- return _lmm_activity_recognition(OpenAILMM(), segment, prompt)
2440
-
2441
- elif model == "qwen2vl":
2442
-
2443
- def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
2444
- return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
2445
-
2446
- elif model == "qwen25vl":
2447
-
2448
- def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
2449
- return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
2450
-
2451
- else:
2452
- raise ValueError(f"Invalid model: {model}")
2453
-
2454
- with ThreadPoolExecutor() as executor:
2455
- futures = {
2456
- executor.submit(_apply_activity_recognition, segment): segment_index
2457
- for segment_index, segment in enumerate(segments)
2458
- }
2459
-
2460
- return_value_tuples = []
2461
- for future in as_completed(futures):
2462
- segment_index = futures[future]
2463
- return_value_tuples.append((segment_index, future.result()))
2464
- return_values = [x[1] for x in sorted(return_value_tuples, key=lambda x: x[0])]
2465
- return_values_flattened = cast(List[float], [e for o in return_values for e in o])
2466
-
2467
2379
  _display_tool_trace(
2468
- activity_recognition.__name__,
2469
- {"prompt": prompt, "model": model},
2470
- return_values,
2380
+ agentic_activity_recognition.__name__,
2381
+ {"prompt": prompt, "specificity": specificity, "with_audio": with_audio},
2382
+ response,
2471
2383
  files,
2472
2384
  )
2473
- return return_values_flattened
2385
+
2386
+ events: List[Dict[str, Any]] = response["events"]
2387
+
2388
+ return events
2474
2389
 
2475
2390
 
2476
2391
  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
@@ -2751,104 +2666,6 @@ def template_match(
2751
2666
  return return_data
2752
2667
 
2753
2668
 
2754
- def flux_image_inpainting(
2755
- prompt: str,
2756
- image: np.ndarray,
2757
- mask: np.ndarray,
2758
- ) -> np.ndarray:
2759
- """'flux_image_inpainting' performs image inpainting to fill the masked regions,
2760
- given by mask, in the image, given image based on the text prompt and surrounding
2761
- image context. It can be used to edit regions of an image according to the prompt
2762
- given.
2763
-
2764
- Parameters:
2765
- prompt (str): A detailed text description guiding what should be generated
2766
- in the masked area. More detailed and specific prompts typically yield
2767
- better results.
2768
- image (np.ndarray): The source image to be inpainted. The image will serve as
2769
- the base context for the inpainting process.
2770
- mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
2771
- areas to be inpainted and 0 indicates areas to be preserved.
2772
-
2773
- Returns:
2774
- np.ndarray: The generated image(s) as a numpy array in RGB format with values
2775
- ranging from 0 to 255.
2776
-
2777
- -------
2778
- Example:
2779
- >>> # Generate inpainting
2780
- >>> result = flux_image_inpainting(
2781
- ... prompt="a modern black leather sofa with white pillows",
2782
- ... image=image,
2783
- ... mask=mask,
2784
- ... )
2785
- >>> save_image(result, "inpainted_room.png")
2786
- """
2787
-
2788
- min_dim = 8
2789
-
2790
- if any(dim < min_dim for dim in image.shape[:2] + mask.shape[:2]):
2791
- raise ValueError(f"Image and mask must be at least {min_dim}x{min_dim} pixels")
2792
-
2793
- max_size = (512, 512)
2794
-
2795
- if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
2796
- scaling_factor = min(max_size[0] / image.shape[0], max_size[1] / image.shape[1])
2797
- new_size = (
2798
- int(image.shape[1] * scaling_factor),
2799
- int(image.shape[0] * scaling_factor),
2800
- )
2801
- new_size = ((new_size[0] // 8) * 8, (new_size[1] // 8) * 8)
2802
- image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
2803
- mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)
2804
-
2805
- elif image.shape[0] % 8 != 0 or image.shape[1] % 8 != 0:
2806
- new_size = ((image.shape[1] // 8) * 8, (image.shape[0] // 8) * 8)
2807
- image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
2808
- mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)
2809
-
2810
- if np.array_equal(mask, mask.astype(bool).astype(int)):
2811
- mask = np.where(mask > 0, 255, 0).astype(np.uint8)
2812
- else:
2813
- raise ValueError("Mask should contain only binary values (0 or 1)")
2814
-
2815
- image_file = numpy_to_bytes(image)
2816
- mask_file = numpy_to_bytes(mask)
2817
-
2818
- files = [
2819
- ("image", image_file),
2820
- ("mask_image", mask_file),
2821
- ]
2822
-
2823
- payload = {
2824
- "prompt": prompt,
2825
- "task": "inpainting",
2826
- "height": image.shape[0],
2827
- "width": image.shape[1],
2828
- "strength": 0.99,
2829
- "guidance_scale": 18,
2830
- "num_inference_steps": 20,
2831
- "seed": None,
2832
- }
2833
-
2834
- response = send_inference_request(
2835
- payload=payload,
2836
- endpoint_name="flux1",
2837
- files=files,
2838
- v2=True,
2839
- metadata_payload={"function_name": "flux_image_inpainting"},
2840
- )
2841
-
2842
- output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
2843
- _display_tool_trace(
2844
- flux_image_inpainting.__name__,
2845
- payload,
2846
- output_image,
2847
- files,
2848
- )
2849
- return output_image
2850
-
2851
-
2852
2669
  def gemini_image_generation(
2853
2670
  prompt: str,
2854
2671
  image: Optional[np.ndarray] = None,
@@ -2894,24 +2711,18 @@ def gemini_image_generation(
2894
2711
  ),
2895
2712
  )
2896
2713
 
2897
- if (
2898
- not resp.candidates
2899
- or not resp.candidates[0].content
2900
- or not resp.candidates[0].content.parts
2901
- or not resp.candidates[0].content.parts[0].inline_data
2902
- or not resp.candidates[0].content.parts[0].inline_data.data
2903
- ):
2714
+ if not resp.candidates or not resp.candidates[0].content:
2904
2715
  _LOGGER.warning(f"Attempt {attempt + 1}: No candidates returned")
2905
2716
  time.sleep(5)
2906
2717
  continue
2907
- else:
2908
- return (
2909
- resp.candidates[0].content.parts[0].inline_data.data
2910
- if isinstance(
2911
- resp.candidates[0].content.parts[0].inline_data.data, bytes
2912
- )
2913
- else None
2914
- )
2718
+
2719
+ for part in resp.candidates[0].content.parts:
2720
+ if (
2721
+ hasattr(part, "inline_data")
2722
+ and part.inline_data
2723
+ and isinstance(data := part.inline_data.data, bytes)
2724
+ ):
2725
+ return data
2915
2726
 
2916
2727
  except genai.errors.ClientError as e:
2917
2728
  _LOGGER.warning(f"Attempt {attempt + 1} failed: {str(e)}")
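The retry loop above now scans every part of the first candidate for inline image bytes instead of assuming parts[0] carries them. As a standalone restatement (hypothetical helper name; resp is a google-genai GenerateContentResponse):

from typing import Optional

def first_inline_image(resp) -> Optional[bytes]:
    # No candidates or empty content: the caller should retry or give up.
    if not resp.candidates or not resp.candidates[0].content:
        return None
    # Return the first part that actually carries inline image data.
    for part in resp.candidates[0].content.parts:
        if getattr(part, "inline_data", None) and isinstance(part.inline_data.data, bytes):
            return part.inline_data.data
    return None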
@@ -2932,8 +2743,6 @@ def gemini_image_generation(
2932
2743
  )
2933
2744
  image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
2934
2745
 
2935
- # Convert to RGB
2936
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
2937
2746
  image_file = numpy_to_bytes(image)
2938
2747
  files = [("image", image_file)]
2939
2748
 
@@ -3756,13 +3565,13 @@ FUNCTION_TOOLS = [
3756
3565
  agentic_document_extraction,
3757
3566
  document_qa,
3758
3567
  ocr,
3568
+ gemini_image_generation,
3759
3569
  qwen25_vl_images_vqa,
3760
3570
  qwen25_vl_video_vqa,
3761
- activity_recognition,
3571
+ agentic_activity_recognition,
3762
3572
  depth_anything_v2,
3763
3573
  generate_pose_image,
3764
3574
  vit_nsfw_classification,
3765
- flux_image_inpainting,
3766
3575
  siglip_classification,
3767
3576
  minimum_distance,
3768
3577
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vision-agent
3
- Version: 1.1.13
3
+ Version: 1.1.15
4
4
  Summary: Toolset for Vision Agent
5
5
  Project-URL: Homepage, https://landing.ai
6
6
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -1,14 +1,14 @@
1
1
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
- vision_agent/.sim_tools/df.csv,sha256=fLh8HN76ezbOXZUoZbnkhNi5vvjYif2jSblHtRdY8dY,41875
3
- vision_agent/.sim_tools/embs.npy,sha256=uUPZ6QuCAr8JAtFa1L9ndAag5ycptIeJ2I8P9U8Y6YY,245888
2
+ vision_agent/.sim_tools/df.csv,sha256=i732_U1KQf55UNhT-9srtZXF91XvDnfWBDdc8EqDmpw,41215
3
+ vision_agent/.sim_tools/embs.npy,sha256=XCu3LnLS10IS3npfPMqX2VHIbDPq9iY_NPDBwq5AEj0,245888
4
4
  vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
5
5
  vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
6
6
  vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
7
7
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
8
8
  vision_agent/agent/vision_agent_coder_v2.py,sha256=ELc_J8Q4NKPs7YETu3a9O0Vk1zN3k6QfHBgu0M0IWGk,17450
9
- vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=YARVphHKLMNUqCeOsrManvgecl77RP1g51vtt7JpdWk,35937
9
+ vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=O24BpRhMRZx7D_WdaRv-a2K6fLpin0o7oWxlvL70WpM,35944
10
10
  vision_agent/agent/vision_agent_planner_v2.py,sha256=Aww_BJhTFKZ5XjYe8FW57z2Gwp2se0vg1t1DKLGRAyQ,22050
11
- vision_agent/agent/vision_agent_prompts_v2.py,sha256=6l0o6yAEcaTBOxkHPNJcdV2wkLpoMIiB_9ZqgL2qo2k,4231
11
+ vision_agent/agent/vision_agent_prompts_v2.py,sha256=NG1xnZvZGi4DcqdfqZCkPkS7oka3gr6h42ekUKUKcqY,4231
12
12
  vision_agent/agent/vision_agent_v2.py,sha256=iPW6DowH7wCFIA5vb1SdSLfZFWbn_oSC7Xa8uO8KIJI,11675
13
13
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
26
26
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
27
27
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
28
28
  vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
29
- vision_agent/tools/__init__.py,sha256=PRUka2eqHwPWJxwfpLj-O2Ab7hXG_dsE1Aov3TE6teM,2496
29
+ vision_agent/tools/__init__.py,sha256=zf8HzjcMSgxKhtrxbqYe9hmvsfuweeDMrOc8eVA8Ya8,2477
30
30
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
31
31
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
- vision_agent/tools/tools.py,sha256=A1YpJuarR1P9ZLnCuakxLiUUtYsnlrvfwlUrkBey_FU,130803
33
+ vision_agent/tools/tools.py,sha256=i9GGGu8tvo2M6O5fF4UUBTpn_Ul2KEN9mG3ZlJ95qao,124929
34
34
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
35
35
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
36
36
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
40
40
  vision_agent/utils/tools_doc.py,sha256=PKcXXbJktiuPi9q6Q1zXzFx24Dh229SNgWBDtZ2fQSQ,2730
41
41
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
42
42
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
43
- vision_agent-1.1.13.dist-info/METADATA,sha256=1LVRyxXfxT_eGGfpgK5fioWESB6FWx4LDm_xylNpZdY,12673
44
- vision_agent-1.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
- vision_agent-1.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
46
- vision_agent-1.1.13.dist-info/RECORD,,
43
+ vision_agent-1.1.15.dist-info/METADATA,sha256=EkYUNPMuq2WuDoBFVhKMT9H06z7-wzjWjV4EQGeIf8E,12673
44
+ vision_agent-1.1.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
+ vision_agent-1.1.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
46
+ vision_agent-1.1.15.dist-info/RECORD,,