vision-agent 1.1.14__py3-none-any.whl → 1.1.15__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- vision_agent/.sim_tools/df.csv +39 -68
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
- vision_agent/agent/vision_agent_prompts_v2.py +1 -1
- vision_agent/tools/__init__.py +1 -2
- vision_agent/tools/tools.py +43 -234
- {vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/METADATA +1 -1
- {vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/RECORD +10 -10
- {vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/WHEEL +0 -0
- {vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/licenses/LICENSE +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -406,6 +406,29 @@ desc,doc,name
 [
 {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
 ]",ocr
+"'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: Optional[numpy.ndarray] = None) -> numpy.ndarray:
+'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
+It can be used to edit parts of an image or the entire image according to the prompt given.
+
+Parameters:
+prompt (str): A detailed text description guiding what should be generated
+in the image. More detailed and specific prompts typically yield
+better results.
+image (np.ndarray, optional): The source image to be inpainted. The image will serve as
+the base context for the inpainting process.
+
+Returns:
+np.ndarray: The generated image(s) as a numpy array in RGB format with values
+ranging from 0 to 255.
+
+-------
+Example:
+>>> # Generate inpainting
+>>> result = gemini_image_generation(
+... prompt=""a modern black leather sofa with white pillows"",
+... image=image,
+... )
+>>> save_image(result, ""inpainted_room.png"")",gemini_image_generation
 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
 images including regular images or images of documents or presentations. It can be
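The entry above documents the reworked gemini_image_generation tool, whose image argument is now optional. A minimal usage sketch, not part of the diff, assuming save_image is importable from vision_agent.tools as in the docstring example and using a zero-filled placeholder array in place of a real photo:

import numpy as np
from vision_agent.tools import gemini_image_generation, save_image

# Text-only generation: with `image` now Optional, no source image is required.
sofa = gemini_image_generation(prompt="a modern black leather sofa with white pillows")
save_image(sofa, "sofa.png")

# Prompt-guided edit of an existing RGB image (placeholder array shown here).
source = np.zeros((512, 512, 3), dtype=np.uint8)
edited = gemini_image_generation(prompt="add two white pillows to the sofa", image=source)
save_image(edited, "sofa_edited.png")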
@@ -439,27 +462,28 @@ desc,doc,name
 -------
 >>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
 'Lionel Messi'",qwen25_vl_video_vqa
-'
-'
-
-happen in a video and returns a list of 0s and 1s to indicate the activity.
+"'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video. It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.","agentic_activity_recognition(prompt: str, frames: List[numpy.ndarray], fps: Optional[float] = 5, specificity: str = 'max', with_audio: bool = False) -> List[Dict[str, Any]]:
+'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video.
+It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.
 
 Parameters:
-prompt (str): The
-
-
-
-
-chunk_length_frames (int): length of each chunk in frames
+prompt (str): The prompt for activity recognition. Multiple activieties can be separated by semi-colon.
+frames (List[np.ndarray]): The list of frames corresponding to the video.
+fps (float, optional): The frame rate per second to extract the frames at. Defaults to 5.
+specificity (str, optional): Specificity or precision level for activity recognition - low, medium, high, max. Default is max.
+with_audio (bool, optional): Whether to include audio processing in activity recognition. Set it to false if there is no audio in the video. Default is false.
 
 Returns:
-List[
-the
+List[Dict[str, Any]]: A list of dictionaries containing the start time, end time, location, description, and label for each detected activity.
+The start and end times are in seconds, the location is a string, the description is a string, and the label is an integer.
 
 Example
 -------
->>>
-[
+>>> agentic_activity_recognition('Person gets on bike; Person gets off bike', frames)
+[
+{'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
+{'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
+]",agentic_activity_recognition
 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
 depth image from a given RGB image. The returned depth image is monochrome and
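A short sketch of consuming the event list returned by the new agentic_activity_recognition tool described in the entry above; this is not part of the diff, and the placeholder frames stand in for a real decoded video:

from typing import Any, Dict, List

import numpy as np
from vision_agent.tools import agentic_activity_recognition

# Placeholder clip: 30 black 720p frames, treated as if sampled at the default 5 fps.
frames: List[np.ndarray] = [np.zeros((720, 1280, 3), dtype=np.uint8) for _ in range(30)]

events: List[Dict[str, Any]] = agentic_activity_recognition(
    "Person gets on bike; Person gets off bike",  # activities separated by semi-colons
    frames,
    fps=5,
    specificity="max",
    with_audio=False,  # placeholder frames carry no audio
)

# Per the docstring, each event reports start/end times in seconds and a label
# index into the semicolon-separated prompt.
for event in events:
    duration = event["end_time"] - event["start_time"]
    print(f"label={event['label']} ({duration}s): {event['description']}")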
@@ -514,59 +538,6 @@ desc,doc,name
 -------
 >>> vit_nsfw_classification(image)
 {""label"": ""normal"", ""scores"": 0.68},",vit_nsfw_classification
-"'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
-'flux_image_inpainting' performs image inpainting to fill the masked regions,
-given by mask, in the image, given image based on the text prompt and surrounding
-image context. It can be used to edit regions of an image according to the prompt
-given.
-
-Parameters:
-prompt (str): A detailed text description guiding what should be generated
-in the masked area. More detailed and specific prompts typically yield
-better results.
-image (np.ndarray): The source image to be inpainted. The image will serve as
-the base context for the inpainting process.
-mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
-areas to be inpainted and 0 indicates areas to be preserved.
-
-Returns:
-np.ndarray: The generated image(s) as a numpy array in RGB format with values
-ranging from 0 to 255.
-
--------
-Example:
->>> # Generate inpainting
->>> result = flux_image_inpainting(
-... prompt=""a modern black leather sofa with white pillows"",
-... image=image,
-... mask=mask,
-... )
->>> save_image(result, ""inpainted_room.png"")
-",flux_image_inpainting
-"'gemini_image_generation' performs image inpainting given an image and text prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: numpy.ndarray) -> numpy.ndarray:
-'gemini_image_generation' performs image inpainting given an image and text prompt.
-It can be used to edit parts of an image or the entire image according to the prompt given.
-
-Parameters:
-prompt (str): A detailed text description guiding what should be generated
-in the image. More detailed and specific prompts typically yield
-better results.
-image (np.ndarray): The source image to be inpainted. The image will serve as
-the base context for the inpainting process.
-
-Returns:
-np.ndarray: The generated image(s) as a numpy array in RGB format with values
-ranging from 0 to 255.
-
--------
-Example:
->>> # Generate inpainting
->>> result = gemini_image_generation(
-... prompt="a modern black leather sofa with white pillows",
-... image=image,
-... )
->>> save_image(result, ""inpainted_room.png"")
-",gemini_image_generation
 'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
 'siglip_classification' is a tool that can classify an image or a cropped detection given a list
 of input labels or tags. It returns the same list of the input labels along with
@@ -718,4 +689,4 @@ desc,doc,name
 [0, 0, 0, ..., 0, 0, 0],
 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 }],
-)",overlay_segmentation_masks
+)",overlay_segmentation_masks
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
vision_agent/agent/vision_agent_planner_prompts_v2.py
CHANGED
@@ -519,7 +519,7 @@ You are given a task: "{task}" from the user. You must extract the type of categ
 - "video object tracking" - tracking objects in a video.
 - "depth and pose estimation" - estimating the depth or pose of objects in an image.
 - "activity recognition" - identifying time period(s) an event occurs in a video.
-- "
+- "image generation" - generating images from a text prompt.
 
 Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
 """
vision_agent/agent/vision_agent_prompts_v2.py
CHANGED
@@ -55,7 +55,7 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
 - Pose estimation
 - Visual question answering for both images and videos
 - Activity recognition in videos
-- Image
+- Image generation
 
 How can I help you?</response>
 --- END EXAMPLE2 ---
vision_agent/tools/__init__.py
CHANGED
@@ -7,7 +7,7 @@ from .meta_tools import (
 from .planner_tools import judge_od_results
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (
-
+agentic_activity_recognition,
 agentic_document_extraction,
 agentic_object_detection,
 agentic_sam2_instance_segmentation,
@@ -30,7 +30,6 @@ from .tools import (
 florence2_ocr,
 florence2_sam2_instance_segmentation,
 florence2_sam2_video_tracking,
-flux_image_inpainting,
 gemini_image_generation,
 generate_pose_image,
 get_tools,
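Because flux_image_inpainting is removed from the public exports above, downstream imports need updating. A hedged migration sketch, not taken from the package: the replacement tool has no mask parameter, so the desired edit must be described in the prompt, and the image below is an illustrative placeholder:

import numpy as np
from vision_agent.tools import gemini_image_generation  # was: flux_image_inpainting

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder RGB source image

# 1.1.14: result = flux_image_inpainting(prompt, image, mask)
# 1.1.15: no mask argument; state the region and content to change in the prompt.
result = gemini_image_generation(
    prompt="replace the sofa with a modern black leather sofa with white pillows",
    image=image,
)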
vision_agent/tools/tools.py
CHANGED
@@ -24,7 +24,7 @@ import pymupdf  # type: ignore
 from google import genai  # type: ignore
 from google.genai import types  # type: ignore
 
-from vision_agent.lmm.lmm import
+from vision_agent.lmm.lmm import AnthropicLMM
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
 b64_to_pil,
@@ -2337,140 +2337,55 @@ Answer the question directly using only the information from the document, do no
 return llm_output
 
 
-def
-sample_indices = np.linspace(0, len(frames) - 1, sample_size, dtype=int)
-sampled_frames = []
-
-for i, frame in enumerate(frames):
-if i in sample_indices:
-sampled_frames.append(frame)
-if len(sampled_frames) >= sample_size:
-break
-return sampled_frames
-
-
-def _lmm_activity_recognition(
-lmm: LMM,
-segment: List[np.ndarray],
-prompt: str,
-) -> List[float]:
-frames = _sample(segment, 10)
-media = []
-for frame in frames:
-buffer = io.BytesIO()
-image_pil = Image.fromarray(frame)
-if image_pil.size[0] > 768:
-image_pil.thumbnail((768, 768))
-image_pil.save(buffer, format="PNG")
-image_bytes = buffer.getvalue()
-image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
-media.append(image_b64)
-
-response = cast(str, lmm.generate(prompt, media))
-if "yes" in response.lower():
-return [1.0] * len(segment)
-return [0.0] * len(segment)
-
-
-def _qwenvl_activity_recognition(
-segment: List[np.ndarray], prompt: str, model_name: str = "qwen2vl"
-) -> List[float]:
-payload: Dict[str, Any] = {
-"prompt": prompt,
-"model": model_name,
-"function_name": f"{model_name}_vl_video_vqa",
-}
-segment_buffer_bytes = [("video", frames_to_bytes(segment))]
-response = send_inference_request(
-payload, "image-to-text", files=segment_buffer_bytes, v2=True
-)
-if "yes" in response.lower():
-return [1.0] * len(segment)
-return [0.0] * len(segment)
-
-
-def activity_recognition(
+def agentic_activity_recognition(
 prompt: str,
 frames: List[np.ndarray],
-
-
-
-
-
-happen in a video
+fps: Optional[float] = 5,
+specificity: str = "max",
+with_audio: bool = False,
+) -> List[Dict[str, Any]]:
+"""'agentic_activity_recognition' is a tool that allows you to detect multiple activities within a video.
+It can be used to identify when specific activities or actions happen in a video, along with a description of the activity.
 
 Parameters:
-prompt (str): The
-
-
-
-
-chunk_length_frames (int): length of each chunk in frames
+prompt (str): The prompt for activity recognition. Multiple activieties can be separated by semi-colon.
+frames (List[np.ndarray]): The list of frames corresponding to the video.
+fps (float, optional): The frame rate per second to extract the frames at. Defaults to 5.
+specificity (str, optional): Specificity or precision level for activity recognition - low, medium, high, max. Default is max.
+with_audio (bool, optional): Whether to include audio processing in activity recognition. Set it to false if there is no audio in the video. Default is false.
 
 Returns:
-List[
-the
+List[Dict[str, Any]]: A list of dictionaries containing the start time, end time, location, description, and label for each detected activity.
+The start and end times are in seconds, the location is a string, the description is a string, and the label is an integer.
 
 Example
 -------
->>>
-[
+>>> agentic_activity_recognition('Person gets on bike; Person gets off bike', frames)
+[
+{'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
+{'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
+]
 """
-
-buffer_bytes = frames_to_bytes(frames)
+fps = fps if fps is not None else 5
+buffer_bytes = frames_to_bytes(frames, fps=fps)
 files = [("video", buffer_bytes)]
 
-
-frames, segment_size=chunk_length_frames, overlap=0
-)
+payload = {"prompt": prompt, "specificity": specificity, "with_audio": with_audio}
 
-
-
+response = send_inference_request(
+payload=payload, endpoint_name="activity-recognition", files=files, v2=True
 )
 
-if model == "claude-35":
-
-def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-return _lmm_activity_recognition(AnthropicLMM(), segment, prompt)
-
-elif model == "gpt-4o":
-
-def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-return _lmm_activity_recognition(OpenAILMM(), segment, prompt)
-
-elif model == "qwen2vl":
-
-def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
-
-elif model == "qwen25vl":
-
-def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
-
-else:
-raise ValueError(f"Invalid model: {model}")
-
-with ThreadPoolExecutor() as executor:
-futures = {
-executor.submit(_apply_activity_recognition, segment): segment_index
-for segment_index, segment in enumerate(segments)
-}
-
-return_value_tuples = []
-for future in as_completed(futures):
-segment_index = futures[future]
-return_value_tuples.append((segment_index, future.result()))
-return_values = [x[1] for x in sorted(return_value_tuples, key=lambda x: x[0])]
-return_values_flattened = cast(List[float], [e for o in return_values for e in o])
-
 _display_tool_trace(
-
-{"prompt": prompt, "
-
+agentic_activity_recognition.__name__,
+{"prompt": prompt, "specificity": specificity, "with_audio": with_audio},
+response,
 files,
 )
-
+
+events: List[Dict[str, Any]] = response["events"]
+
+return events
 
 
 def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
@@ -2751,104 +2666,6 @@ def template_match(
 return return_data
 
 
-def flux_image_inpainting(
-prompt: str,
-image: np.ndarray,
-mask: np.ndarray,
-) -> np.ndarray:
-"""'flux_image_inpainting' performs image inpainting to fill the masked regions,
-given by mask, in the image, given image based on the text prompt and surrounding
-image context. It can be used to edit regions of an image according to the prompt
-given.
-
-Parameters:
-prompt (str): A detailed text description guiding what should be generated
-in the masked area. More detailed and specific prompts typically yield
-better results.
-image (np.ndarray): The source image to be inpainted. The image will serve as
-the base context for the inpainting process.
-mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
-areas to be inpainted and 0 indicates areas to be preserved.
-
-Returns:
-np.ndarray: The generated image(s) as a numpy array in RGB format with values
-ranging from 0 to 255.
-
--------
-Example:
->>> # Generate inpainting
->>> result = flux_image_inpainting(
-... prompt="a modern black leather sofa with white pillows",
-... image=image,
-... mask=mask,
-... )
->>> save_image(result, "inpainted_room.png")
-"""
-
-min_dim = 8
-
-if any(dim < min_dim for dim in image.shape[:2] + mask.shape[:2]):
-raise ValueError(f"Image and mask must be at least {min_dim}x{min_dim} pixels")
-
-max_size = (512, 512)
-
-if image.shape[0] > max_size[0] or image.shape[1] > max_size[1]:
-scaling_factor = min(max_size[0] / image.shape[0], max_size[1] / image.shape[1])
-new_size = (
-int(image.shape[1] * scaling_factor),
-int(image.shape[0] * scaling_factor),
-)
-new_size = ((new_size[0] // 8) * 8, (new_size[1] // 8) * 8)
-image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
-mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)
-
-elif image.shape[0] % 8 != 0 or image.shape[1] % 8 != 0:
-new_size = ((image.shape[1] // 8) * 8, (image.shape[0] // 8) * 8)
-image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
-mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)
-
-if np.array_equal(mask, mask.astype(bool).astype(int)):
-mask = np.where(mask > 0, 255, 0).astype(np.uint8)
-else:
-raise ValueError("Mask should contain only binary values (0 or 1)")
-
-image_file = numpy_to_bytes(image)
-mask_file = numpy_to_bytes(mask)
-
-files = [
-("image", image_file),
-("mask_image", mask_file),
-]
-
-payload = {
-"prompt": prompt,
-"task": "inpainting",
-"height": image.shape[0],
-"width": image.shape[1],
-"strength": 0.99,
-"guidance_scale": 18,
-"num_inference_steps": 20,
-"seed": None,
-}
-
-response = send_inference_request(
-payload=payload,
-endpoint_name="flux1",
-files=files,
-v2=True,
-metadata_payload={"function_name": "flux_image_inpainting"},
-)
-
-output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
-_display_tool_trace(
-flux_image_inpainting.__name__,
-payload,
-output_image,
-files,
-)
-return output_image
-
-
 def gemini_image_generation(
 prompt: str,
 image: Optional[np.ndarray] = None,
@@ -2894,24 +2711,18 @@ def gemini_image_generation(
 ),
 )
 
-if
-not resp.candidates
-or not resp.candidates[0].content
-or not resp.candidates[0].content.parts
-or not resp.candidates[0].content.parts[0].inline_data
-or not resp.candidates[0].content.parts[0].inline_data.data
-):
+if not resp.candidates or not resp.candidates[0].content:
 _LOGGER.warning(f"Attempt {attempt + 1}: No candidates returned")
 time.sleep(5)
 continue
-
-
-
-
-
-)
-
-
+
+for part in resp.candidates[0].content.parts:
+if (
+hasattr(part, "inline_data")
+and part.inline_data
+and isinstance(data := part.inline_data.data, bytes)
+):
+return data
 
 except genai.errors.ClientError as e:
 _LOGGER.warning(f"Attempt {attempt + 1} failed: {str(e)}")
@@ -2932,8 +2743,6 @@ def gemini_image_generation(
 )
 image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
 
-# Convert to RGB
-image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 image_file = numpy_to_bytes(image)
 files = [("image", image_file)]
 
@@ -3756,13 +3565,13 @@ FUNCTION_TOOLS = [
 agentic_document_extraction,
 document_qa,
 ocr,
+gemini_image_generation,
 qwen25_vl_images_vqa,
 qwen25_vl_video_vqa,
-
+agentic_activity_recognition,
 depth_anything_v2,
 generate_pose_image,
 vit_nsfw_classification,
-flux_image_inpainting,
 siglip_classification,
 minimum_distance,
 ]
{vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/.sim_tools/df.csv,sha256=
-vision_agent/.sim_tools/embs.npy,sha256=
+vision_agent/.sim_tools/df.csv,sha256=i732_U1KQf55UNhT-9srtZXF91XvDnfWBDdc8EqDmpw,41215
+vision_agent/.sim_tools/embs.npy,sha256=XCu3LnLS10IS3npfPMqX2VHIbDPq9iY_NPDBwq5AEj0,245888
 vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
 vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
 vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
 vision_agent/agent/vision_agent_coder_v2.py,sha256=ELc_J8Q4NKPs7YETu3a9O0Vk1zN3k6QfHBgu0M0IWGk,17450
-vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
+vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=O24BpRhMRZx7D_WdaRv-a2K6fLpin0o7oWxlvL70WpM,35944
 vision_agent/agent/vision_agent_planner_v2.py,sha256=Aww_BJhTFKZ5XjYe8FW57z2Gwp2se0vg1t1DKLGRAyQ,22050
-vision_agent/agent/vision_agent_prompts_v2.py,sha256=
+vision_agent/agent/vision_agent_prompts_v2.py,sha256=NG1xnZvZGi4DcqdfqZCkPkS7oka3gr6h42ekUKUKcqY,4231
 vision_agent/agent/vision_agent_v2.py,sha256=iPW6DowH7wCFIA5vb1SdSLfZFWbn_oSC7Xa8uO8KIJI,11675
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
 vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
 vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=zf8HzjcMSgxKhtrxbqYe9hmvsfuweeDMrOc8eVA8Ya8,2477
 vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
 vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=i9GGGu8tvo2M6O5fF4UUBTpn_Ul2KEN9mG3ZlJ95qao,124929
 vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
 vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
 vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
 vision_agent/utils/tools_doc.py,sha256=PKcXXbJktiuPi9q6Q1zXzFx24Dh229SNgWBDtZ2fQSQ,2730
 vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
 vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
-vision_agent-1.1.
-vision_agent-1.1.
-vision_agent-1.1.
-vision_agent-1.1.
+vision_agent-1.1.15.dist-info/METADATA,sha256=EkYUNPMuq2WuDoBFVhKMT9H06z7-wzjWjV4EQGeIf8E,12673
+vision_agent-1.1.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+vision_agent-1.1.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-1.1.15.dist-info/RECORD,,
{vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/WHEEL
File without changes
{vision_agent-1.1.14.dist-info → vision_agent-1.1.15.dist-info}/licenses/LICENSE
File without changes