vision-agent 0.2.231__tar.gz → 0.2.232__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.231 → vision_agent-0.2.232}/PKG-INFO +1 -1
- {vision_agent-0.2.231 → vision_agent-0.2.232}/pyproject.toml +1 -1
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv +12 -10
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py +4 -2
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py +15 -1
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py +12 -8
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py +1 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py +4 -1
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py +5 -2
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py +33 -13
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tools.py +44 -18
- {vision_agent-0.2.231 → vision_agent-0.2.232}/LICENSE +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/README.md +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/anthropic_config.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/anthropic_openai_config.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/config.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/openai_config.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/video.py +0 -0
- {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/video_tracking.py +0 -0
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv RENAMED

@@ -514,7 +514,7 @@ desc,doc,name
     >>> vit_nsfw_classification(image)
     {""label"": ""normal"", ""scores"": 0.68},
 ",vit_nsfw_classification
-'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames:
+'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
     'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
     chunk_length_frames given a text prompt such as a referring expression

@@ -527,7 +527,7 @@ desc,doc,name
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
             'qwen2vl', 'gpt4o'.
-        chunk_length_frames (
+        chunk_length_frames (int): length of each chunk in frames

     Returns:
         List[float]: A list of floats with a value of 1.0 if the objects to be found

@@ -540,16 +540,18 @@ desc,doc,name
 ",video_temporal_localization
 "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
     'flux_image_inpainting' performs image inpainting to fill the masked regions,
-    given by mask, in the image, given image based on the text prompt and surrounding
-    It can be used to edit regions of an image according to the prompt
+    given by mask, in the image, given image based on the text prompt and surrounding
+    image context. It can be used to edit regions of an image according to the prompt
+    given.

     Parameters:
         prompt (str): A detailed text description guiding what should be generated
-            in the masked area. More detailed and specific prompts typically yield
-
-
-
-
+            in the masked area. More detailed and specific prompts typically yield
+            better results.
+        image (np.ndarray): The source image to be inpainted. The image will serve as
+            the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+            areas to be inpainted and 0 indicates areas to be preserved.

     Returns:
         np.ndarray: The generated image(s) as a numpy array in RGB format with values

@@ -658,7 +660,7 @@ desc,doc,name
     -------
     >>> save_image(image)
 ",save_image
-'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float =
+'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
     'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.

     Parameters:
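The save_video change above just fills in the previously truncated default (fps: float = 5). A minimal usage sketch, assuming save_video is importable from vision_agent.tools as documented; the frames below are blank placeholders:

```python
import numpy as np
from vision_agent.tools import save_video  # assumed import location

# Ten blank 480x640 RGB frames standing in for real video frames.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

# With no output_video_path, save_video writes a temporary .mp4 and returns its path;
# fps defaults to 5 per the signature shown above.
video_path = save_video(frames, fps=5)
print(video_path)
```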
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py RENAMED

@@ -148,8 +148,10 @@ def format_plan_v2(plan: PlanContext) -> str:
     plan_str += "Instructions:\n"
     for v in plan.instructions:
         plan_str += f" - {v}\n"
-
-
+
+    if plan.code:
+        plan_str += "Code:\n"
+        plan_str += plan.code
     return plan_str

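With this change, format_plan_v2 appends the stored code under a "Code:" heading whenever the plan context carries one. A minimal sketch, assuming PlanContext is importable from vision_agent.agent.types and takes the plan/instructions/code fields used elsewhere in this diff:

```python
from vision_agent.agent.agent_utils import format_plan_v2
from vision_agent.agent.types import PlanContext  # assumed import location

ctx = PlanContext(
    plan="Count the dogs in the image.",
    instructions=["Detect dogs with an object detection tool.", "Return the count."],
    code="dets = detect_dogs(image)\nprint(len(dets))",  # illustrative snippet only
)

# The formatted plan now ends with a "Code:" section containing ctx.code.
print(format_plan_v2(ctx))
```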
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py RENAMED

@@ -6,7 +6,7 @@ FEEDBACK = """


 CODE = """
-**Role**: You are an
+**Role**: You are an expert software programmer.

 **Task**: You are given a plan by a planning agent that solves a vision problem posed by the user. You are also given code snippets that the planning agent used to solve the task. Your job is to organize the code so that it can be easily called by the user to solve the task.

{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py RENAMED

@@ -425,6 +425,8 @@ class VisionAgentCoderV2(AgentCoder):
             chat (List[AgentMessage]): The input to the agent. This should be a list of
                 AgentMessage objects.
             plan_context (PlanContext): The plan context that was previously generated.
+                If plan_context.code is not provided, then the code will be generated
+                from the chat messages.
             code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.

         Returns:

@@ -455,12 +457,24 @@ class VisionAgentCoderV2(AgentCoder):
         int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
         tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)

+        # If code is not provided from the plan_context then generate it, else use
+        # the provided code and start with testing
+        if not plan_context.code.strip():
+            code = write_code(
+                coder=self.coder,
+                chat=int_chat,
+                tool_docs=tool_docs,
+                plan=format_plan_v2(plan_context),
+            )
+        else:
+            code = plan_context.code
+
         code_context = test_code(
             tester=self.tester,
             debugger=self.debugger,
             chat=int_chat,
             plan=format_plan_v2(plan_context),
-            code=
+            code=code,
             tool_docs=tool_docs,
             code_interpreter=code_interpreter,
             media_list=media_list,
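The net effect: a caller that already has code in its PlanContext skips write_code and goes straight to testing. A hedged sketch of that call path; the AgentMessage fields and import paths are assumptions based on the types referenced in this diff:

```python
from vision_agent.agent import VisionAgentCoderV2  # assumed import location
from vision_agent.agent.types import AgentMessage, PlanContext  # assumed import location

coder = VisionAgentCoderV2()

# plan_context.code is non-empty, so the coder reuses it and only runs test_code.
context = coder.generate_code_from_plan(
    chat=[AgentMessage(role="user", content="Count the cars in traffic.jpg", media=["traffic.jpg"])],
    plan_context=PlanContext(
        plan="Detect cars with an object detection tool and count them.",
        instructions=["Load traffic.jpg", "Detect the cars", "Print the count"],
        code="# previously generated code would go here",
    ),
)
print(context)
```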
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py RENAMED

@@ -458,6 +458,8 @@ You are given a task: "{task}" from the user. You must extract the type of categ
 - "DocQA" - answering questions about a document or extracting information from a document.
 - "video object tracking" - tracking objects in a video.
 - "depth and pose estimation" - estimating the depth or pose of objects in an image.
+- "temporal localization" - localizing the time period an event occurs in a video.
+- "inpainting" - filling in masked parts of an image.

 Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
 """

@@ -651,22 +653,24 @@ PICK_TOOL = """
 """

 FINALIZE_PLAN = """
-**
+**Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent to write a program to accomplish the user request.

-**
+**Documentation**: You can use these tools to help you visualize or save the output:
+{tool_desc}

 **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
 {planning}

 **Instructions**:
 1. Summarize the plan that the planning agent found.
-2. Write a single function that solves the problem based on what the planner found.
-3.
+2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
+3. Only use tools obtained from calling `get_tool_for_task`.
 4. Do not include {excluded_tools} tools in your instructions.
-5.
-6.
-7.
-8.
+5. Ensure the function is well documented and easy to understand.
+6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
+7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
+9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:

 <json>
 {{
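Instruction 6 above asks the finalized plan to visualize and save its output. A minimal sketch of that pattern, assuming the usual vision_agent.tools detection format (normalized [xmin, ymin, xmax, ymax] boxes with label and score):

```python
import numpy as np
from vision_agent.tools import overlay_bounding_boxes, save_image  # assumed imports

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
detections = [{"score": 0.91, "label": "dog", "bbox": [0.1, 0.2, 0.4, 0.6]}]  # illustrative

# Draw the boxes on a copy of the image and write the result to disk.
viz = overlay_bounding_boxes(image, detections)
save_image(viz, "detections.png")
```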
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py RENAMED

@@ -326,6 +326,7 @@ def create_finalize_plan(
         return [], PlanContext(plan="", instructions=[], code="")

     prompt = FINALIZE_PLAN.format(
+        tool_desc=UTIL_DOCSTRING,
         planning=get_planning(chat),
         excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
     )
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py RENAMED

@@ -42,6 +42,8 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
 - Understanding documents
 - Pose estimation
 - Visual question answering for both images and videos
+- Action recognition in videos
+- Image inpainting

 How can I help you?</response>
 --- END EXAMPLE2 ---

@@ -54,7 +56,8 @@ Here is the current conversation so far:

 **Instructions**:
 1. Only respond with a single <response> tag and a single <action> tag.
-2.
+2. You can only take one action at a time in response to the user's message. Do not offer to fix code on the user's behalf, only if they have directly asked you to.
+3. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:

 <response>Your response to the user's message</response>
 <action>The action you want to take from **Actions**</action>
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py RENAMED

@@ -112,14 +112,17 @@ def maybe_run_action(
             )
         ]
     elif action == "edit_code":
+        # We don't want to pass code in plan_context.code so the coder will generate
+        # new code from plan_context.plan
         plan_context = PlanContext(
-            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
+            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
+            + ("<code>\n" + final_code + "\n</code>" if final_code is not None else ""),
             instructions=[
                 chat_i.content
                 for chat_i in extracted_chat
                 if chat_i.role == "user" and "<final_code>" not in chat_i.content
             ],
-            code=
+            code="",
         )
         context = coder.generate_code_from_plan(
             extracted_chat, plan_context, code_interpreter=code_interpreter
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py RENAMED

@@ -2,7 +2,7 @@ import inspect
 import logging
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Optional, Tuple, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

 import libcst as cst
 import numpy as np

@@ -235,7 +235,9 @@ def run_tool_testing(


 def get_tool_for_task(
-    task: str,
+    task: str,
+    images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+    exclude_tools: Optional[List[str]] = None,
 ) -> None:
     """Given a task and one or more images this function will find a tool to accomplish
     the jobs. It prints the tool documentation and thoughts on why it chose the tool.

@@ -248,6 +250,8 @@ def get_tool_for_task(
     - VQA
     - Depth and pose estimation
     - Video object tracking
+    - Video temporal localization (action recognition)
+    - Image inpainting

     Only ask for one type of task at a time, for example a task needing to identify
     text is one OCR task while needing to identify non-text objects is an OD task. Wait

@@ -256,7 +260,8 @@ def get_tool_for_task(

     Parameters:
         task: str: The task to accomplish.
-        images: List[np.ndarray]: The images to use
+        images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
+            for the task. If a key is provided, it is used as the file name.
         exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
             recommendations. This is helpful if you are calling get_tool_for_task twice
             and do not want the same tool recommended.

@@ -266,20 +271,29 @@ def get_tool_for_task(

     Examples
     --------
-    >>> get_tool_for_task(
+    >>> get_tool_for_task(
+    >>>     "Give me an OCR model that can find 'hot chocolate' in the image",
+    >>>     {"image": [image]})
+    >>> get_tool_for_taks(
+    >>>     "I need a tool that can paint a background for this image and maks",
+    >>>     {"image": [image], "mask": [mask]})
     """
     tool_tester = CONFIG.create_tool_tester()
     tool_chooser = CONFIG.create_tool_chooser()

+    if isinstance(images, list):
+        images = {"image": images}
+
     with (
         tempfile.TemporaryDirectory() as tmpdirname,
         CodeInterpreterFactory.new_instance() as code_interpreter,
     ):
         image_paths = []
-        for
-
-
-
+        for k in images.keys():
+            for i, image in enumerate(images[k]):
+                image_path = f"{tmpdirname}/{k}_{i}.png"
+                Image.fromarray(image).save(image_path)
+                image_paths.append(image_path)

         code, tool_docs_str, tool_output = run_tool_testing(
             task, image_paths, tool_tester, exclude_tools, code_interpreter

@@ -300,20 +314,26 @@ def get_tool_documentation(tool_name: str) -> str:


 def get_tool_for_task_human_reviewer(
-    task: str,
+    task: str,
+    images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+    exclude_tools: Optional[List[str]] = None,
 ) -> None:
     # NOTE: this will have the same documentation as get_tool_for_task
     tool_tester = CONFIG.create_tool_tester()

+    if isinstance(images, list):
+        images = {"image": images}
+
     with (
         tempfile.TemporaryDirectory() as tmpdirname,
         CodeInterpreterFactory.new_instance() as code_interpreter,
     ):
         image_paths = []
-        for
-
-
-
+        for k in images.keys():
+            for i, image in enumerate(images[k]):
+                image_path = f"{tmpdirname}/{k}_{i}.png"
+                Image.fromarray(image).save(image_path)
+                image_paths.append(image_path)

         tools = [
             t.__name__
{vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tools.py RENAMED

@@ -1727,22 +1727,46 @@ def video_temporal_localization(
     }
     payload["chunk_length_frames"] = chunk_length_frames

-
-
-
+    segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
+
+    def _apply_temporal_localization(
+        segment: List[np.ndarray],
+    ) -> List[float]:
+        segment_buffer_bytes = [("video", frames_to_bytes(segment))]
+        data = send_inference_request(
+            payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
+        )
+        chunked_data = [cast(float, value) for value in data]
+
+        full_data = []
+        for value in chunked_data:
+            full_data.extend([value] * chunk_length_frames)
+
+        return full_data[: len(segment)]
+
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(_apply_temporal_localization, segment): segment_index
+            for segment_index, segment in enumerate(segments)
+        }
+
+        localization_per_segment = []
+        for future in as_completed(futures):
+            segment_index = futures[future]
+            localization_per_segment.append((segment_index, future.result()))
+
+    localization_per_segment = [
+        x[1] for x in sorted(localization_per_segment, key=lambda x: x[0])  # type: ignore
+    ]
+    localizations = cast(List[float], [e for o in localization_per_segment for e in o])
+
     _display_tool_trace(
         video_temporal_localization.__name__,
         payload,
-
+        localization_per_segment,
         files,
     )
-
-
-    full_data = []
-    for value in chunked_data:
-        full_data.extend([value] * chunk_length_frames)
-
-    return full_data[: len(frames)]
+    return localizations


 def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
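From the caller's side the signature is unchanged; frames are now scored in 50-frame segments in parallel and the per-frame scores are stitched back together in order. A minimal usage sketch with placeholder frames:

```python
import numpy as np
from vision_agent.tools import video_temporal_localization  # assumed import location

# Placeholder frames; real frames would come from a video decoding step.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(120)]

# One float per frame, 1.0 where the prompted event occurs within its chunk.
scores = video_temporal_localization("person waving", frames, chunk_length_frames=2)
print(len(scores), scores[:10])
```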
@@ -2028,16 +2052,18 @@ def flux_image_inpainting(
     mask: np.ndarray,
 ) -> np.ndarray:
     """'flux_image_inpainting' performs image inpainting to fill the masked regions,
-    given by mask, in the image, given image based on the text prompt and surrounding
-    It can be used to edit regions of an image according to the prompt
+    given by mask, in the image, given image based on the text prompt and surrounding
+    image context. It can be used to edit regions of an image according to the prompt
+    given.

     Parameters:
         prompt (str): A detailed text description guiding what should be generated
-            in the masked area. More detailed and specific prompts typically yield
-
-
-
-
+            in the masked area. More detailed and specific prompts typically yield
+            better results.
+        image (np.ndarray): The source image to be inpainted. The image will serve as
+            the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+            areas to be inpainted and 0 indicates areas to be preserved.

     Returns:
         np.ndarray: The generated image(s) as a numpy array in RGB format with values
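A minimal usage sketch of the documented signature; the image and mask below are placeholders, and save_image is assumed to be importable alongside it:

```python
import numpy as np
from vision_agent.tools import flux_image_inpainting, save_image  # assumed imports

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder source image
mask = np.zeros((512, 512), dtype=np.uint8)      # 1 = repaint, 0 = keep
mask[100:200, 100:200] = 1

# The masked square is regenerated from the prompt; the rest of the image is preserved.
result = flux_image_inpainting("a red wooden door", image, mask)
save_image(result, "inpainted.png")
```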
All remaining files listed above are unchanged between the two versions.