vision-agent 0.2.231__tar.gz → 0.2.232__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {vision_agent-0.2.231 → vision_agent-0.2.232}/PKG-INFO +1 -1
  2. {vision_agent-0.2.231 → vision_agent-0.2.232}/pyproject.toml +1 -1
  3. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv +12 -10
  4. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py +4 -2
  5. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
  6. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py +15 -1
  7. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py +12 -8
  8. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py +1 -0
  9. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py +4 -1
  10. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py +5 -2
  11. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py +33 -13
  12. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tools.py +44 -18
  13. {vision_agent-0.2.231 → vision_agent-0.2.232}/LICENSE +0 -0
  14. {vision_agent-0.2.231 → vision_agent-0.2.232}/README.md +0 -0
  15. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/.sim_tools/embs.npy +0 -0
  16. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/__init__.py +0 -0
  17. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/README.md +0 -0
  18. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/__init__.py +0 -0
  19. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/agent.py +0 -0
  20. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/types.py +0 -0
  21. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent.py +0 -0
  22. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder.py +0 -0
  23. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  24. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner.py +0 -0
  25. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  26. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts.py +0 -0
  27. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/clients/__init__.py +0 -0
  28. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/clients/http.py +0 -0
  29. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/clients/landing_public_api.py +0 -0
  30. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/__init__.py +0 -0
  31. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/anthropic_config.py +0 -0
  32. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/anthropic_openai_config.py +0 -0
  33. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/config.py +0 -0
  34. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/configs/openai_config.py +0 -0
  35. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/fonts/__init__.py +0 -0
  36. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  37. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/lmm/__init__.py +0 -0
  38. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/lmm/lmm.py +0 -0
  39. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/lmm/types.py +0 -0
  40. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/__init__.py +0 -0
  41. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/meta_tools.py +0 -0
  42. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/prompts.py +0 -0
  43. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tool_utils.py +0 -0
  44. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/tools/tools_types.py +0 -0
  45. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/__init__.py +0 -0
  46. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/exceptions.py +0 -0
  47. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/execute.py +0 -0
  48. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/image_utils.py +0 -0
  49. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/sim.py +0 -0
  50. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/type_defs.py +0 -0
  51. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/video.py +0 -0
  52. {vision_agent-0.2.231 → vision_agent-0.2.232}/vision_agent/utils/video_tracking.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.231
+ Version: 0.2.232
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
  
  [tool.poetry]
  name = "vision-agent"
- version = "0.2.231"
+ version = "0.2.232"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -514,7 +514,7 @@ desc,doc,name
      >>> vit_nsfw_classification(image)
      {""label"": ""normal"", ""scores"": 0.68},
  ",vit_nsfw_classification
- 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
+ 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
      'video_temporal_localization' will run qwen2vl on each chunk_length_frames
      value selected for the video. It can detect multiple objects independently per
      chunk_length_frames given a text prompt such as a referring expression
@@ -527,7 +527,7 @@ desc,doc,name
          frames (List[np.ndarray]): The reference frames used for the question
          model (str): The model to use for the inference. Valid values are
              'qwen2vl', 'gpt4o'.
-         chunk_length_frames (Optional[int]): length of each chunk in frames
+         chunk_length_frames (int): length of each chunk in frames
  
      Returns:
          List[float]: A list of floats with a value of 1.0 if the objects to be found
@@ -540,16 +540,18 @@ desc,doc,name
  ",video_temporal_localization
  "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
      'flux_image_inpainting' performs image inpainting to fill the masked regions,
-     given by mask, in the image, given image based on the text prompt and surrounding image context.
-     It can be used to edit regions of an image according to the prompt given.
+     given by mask, in the image, given image based on the text prompt and surrounding
+     image context. It can be used to edit regions of an image according to the prompt
+     given.
  
      Parameters:
          prompt (str): A detailed text description guiding what should be generated
-             in the masked area. More detailed and specific prompts typically yield better results.
-         image (np.ndarray): The source image to be inpainted.
-             The image will serve as the base context for the inpainting process.
-         mask (np.ndarray): A binary mask image with 0's and 1's,
-             where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+             in the masked area. More detailed and specific prompts typically yield
+             better results.
+         image (np.ndarray): The source image to be inpainted. The image will serve as
+             the base context for the inpainting process.
+         mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+             areas to be inpainted and 0 indicates areas to be preserved.
  
      Returns:
          np.ndarray: The generated image(s) as a numpy array in RGB format with values
@@ -658,7 +660,7 @@ desc,doc,name
      -------
      >>> save_image(image)
  ",save_image
- 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 1) -> str:
+ 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
      'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
  
      Parameters:
@@ -148,8 +148,10 @@ def format_plan_v2(plan: PlanContext) -> str:
      plan_str += "Instructions:\n"
      for v in plan.instructions:
          plan_str += f" - {v}\n"
-     plan_str += "Code:\n"
-     plan_str += plan.code
+ 
+     if plan.code:
+         plan_str += "Code:\n"
+         plan_str += plan.code
      return plan_str
  
  
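The guard above means a plan without code no longer appends an empty "Code:" section. A minimal sketch of the caller-visible effect (not part of the diff; the import paths are inferred from the file list above and the PlanContext field names from the hunks in this diff):

# Illustrative sketch only; paths and fields beyond this diff are assumptions.
from vision_agent.agent.agent_utils import format_plan_v2
from vision_agent.agent.types import PlanContext

ctx = PlanContext(
    plan="Count the cars in the image",
    instructions=["Load the image", "Run an object detector", "Return the count"],
    code="",  # empty string -> the "Code:" section is now omitted entirely
)
print(format_plan_v2(ctx))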
@@ -6,7 +6,7 @@ FEEDBACK = """
  
  
  CODE = """
- **Role**: You are an expoert software programmer.
+ **Role**: You are an expert software programmer.
  
  **Task**: You are given a plan by a planning agent that solves a vision problem posed by the user. You are also given code snippets that the planning agent used to solve the task. Your job is to organize the code so that it can be easily called by the user to solve the task.
  
@@ -425,6 +425,8 @@ class VisionAgentCoderV2(AgentCoder):
              chat (List[AgentMessage]): The input to the agent. This should be a list of
                  AgentMessage objects.
              plan_context (PlanContext): The plan context that was previously generated.
+                 If plan_context.code is not provided, then the code will be generated
+                 from the chat messages.
              code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
  
          Returns:
@@ -455,12 +457,24 @@ class VisionAgentCoderV2(AgentCoder):
          int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
          tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
  
+         # If code is not provided from the plan_context then generate it, else use
+         # the provided code and start with testing
+         if not plan_context.code.strip():
+             code = write_code(
+                 coder=self.coder,
+                 chat=int_chat,
+                 tool_docs=tool_docs,
+                 plan=format_plan_v2(plan_context),
+             )
+         else:
+             code = plan_context.code
+ 
          code_context = test_code(
              tester=self.tester,
              debugger=self.debugger,
              chat=int_chat,
              plan=format_plan_v2(plan_context),
-             code=plan_context.code,
+             code=code,
              tool_docs=tool_docs,
              code_interpreter=code_interpreter,
              media_list=media_list,
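Taken with the docstring change above, generate_code_from_plan now writes code itself when the plan context carries none. A rough usage sketch (not part of the diff; the default constructor, AgentMessage fields, and the returned context's attributes are assumptions beyond what the hunks show):

# Sketch only; running it requires the usual model/API configuration.
from vision_agent.agent.vision_agent_coder_v2 import VisionAgentCoderV2
from vision_agent.agent.types import AgentMessage, PlanContext

coder = VisionAgentCoderV2()
plan_context = PlanContext(
    plan="Detect people in the image and count them",
    instructions=["Load people.jpg", "Run an object detection tool", "Count the detections"],
    code="",  # empty -> write_code() generates code; non-empty code goes straight to testing
)
code_context = coder.generate_code_from_plan(
    [AgentMessage(role="user", content="Count the people in people.jpg", media=["people.jpg"])],
    plan_context,
)
# code_context carries the tested code produced from the plan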
@@ -458,6 +458,8 @@ You are given a task: "{task}" from the user. You must extract the type of categ
  - "DocQA" - answering questions about a document or extracting information from a document.
  - "video object tracking" - tracking objects in a video.
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.
+ - "temporal localization" - localizing the time period an event occurs in a video.
+ - "inpainting" - filling in masked parts of an image.
  
  Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
  """
@@ -651,22 +653,24 @@ PICK_TOOL = """
  """
  
  FINALIZE_PLAN = """
- **Role**: You are an expert AI model that can understand the user request and construct plans to accomplish it.
+ **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent to write a program to accomplish the user request.
  
- **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agnet to write a program to accomplish the user request.
+ **Documentation**: You can use these tools to help you visualize or save the output:
+ {tool_desc}
  
  **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
  {planning}
  
  **Instructions**:
  1. Summarize the plan that the planning agent found.
- 2. Write a single function that solves the problem based on what the planner found.
- 3. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
+ 2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
+ 3. Only use tools obtained from calling `get_tool_for_task`.
  4. Do not include {excluded_tools} tools in your instructions.
- 5. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_image` or `save_video`.
- 6. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
- 7. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
- 8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
+ 5. Ensure the function is well documented and easy to understand.
+ 6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
+ 7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+ 8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
+ 9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
  
  <json>
  {{
@@ -326,6 +326,7 @@ def create_finalize_plan(
          return [], PlanContext(plan="", instructions=[], code="")
  
      prompt = FINALIZE_PLAN.format(
+         tool_desc=UTIL_DOCSTRING,
          planning=get_planning(chat),
          excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
      )
@@ -42,6 +42,8 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
  - Understanding documents
  - Pose estimation
  - Visual question answering for both images and videos
+ - Action recognition in videos
+ - Image inpainting
  
  How can I help you?</response>
  --- END EXAMPLE2 ---
@@ -54,7 +56,8 @@ Here is the current conversation so far:
  
  **Instructions**:
  1. Only respond with a single <response> tag and a single <action> tag.
- 2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
+ 2. You can only take one action at a time in response to the user's message. Do not offer to fix code on the user's behalf, only if they have directly asked you to.
+ 3. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
  
  <response>Your response to the user's message</response>
  <action>The action you want to take from **Actions**</action>
@@ -112,14 +112,17 @@ def maybe_run_action(
              )
          ]
      elif action == "edit_code":
+         # We don't want to pass code in plan_context.code so the coder will generate
+         # new code from plan_context.plan
          plan_context = PlanContext(
-             plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
+             plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
+             + ("<code>\n" + final_code + "\n</code>" if final_code is not None else ""),
              instructions=[
                  chat_i.content
                  for chat_i in extracted_chat
                  if chat_i.role == "user" and "<final_code>" not in chat_i.content
              ],
-             code=final_code if final_code is not None else "",
+             code="",
          )
          context = coder.generate_code_from_plan(
              extracted_chat, plan_context, code_interpreter=code_interpreter
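Sketched with hypothetical values, the edit_code branch above now produces a context like the following, so the coder regenerates code from the plan text instead of reusing final_code directly (not part of the diff):

# Hypothetical values mirroring the construction in the hunk above.
from vision_agent.agent.types import PlanContext

final_code = "def count_people(image_path: str) -> int: ..."
plan_context = PlanContext(
    plan=(
        "Edit the latest code observed in the fewest steps possible according to "
        "the user's feedback." + "<code>\n" + final_code + "\n</code>"
    ),
    instructions=["Use a higher confidence threshold when counting people"],
    code="",  # intentionally empty so generate_code_from_plan rewrites the code
)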
@@ -2,7 +2,7 @@ import inspect
  import logging
  import tempfile
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Any, Callable, Dict, List, Optional, Tuple, cast
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
  
  import libcst as cst
  import numpy as np
@@ -235,7 +235,9 @@ def run_tool_testing(
  
  
  def get_tool_for_task(
-     task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
+     task: str,
+     images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+     exclude_tools: Optional[List[str]] = None,
  ) -> None:
      """Given a task and one or more images this function will find a tool to accomplish
      the jobs. It prints the tool documentation and thoughts on why it chose the tool.
@@ -248,6 +250,8 @@ def get_tool_for_task(
      - VQA
      - Depth and pose estimation
      - Video object tracking
+     - Video temporal localization (action recognition)
+     - Image inpainting
  
      Only ask for one type of task at a time, for example a task needing to identify
      text is one OCR task while needing to identify non-text objects is an OD task. Wait
@@ -256,7 +260,8 @@ def get_tool_for_task(
  
      Parameters:
          task: str: The task to accomplish.
-         images: List[np.ndarray]: The images to use for the task.
+         images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
+             for the task. If a key is provided, it is used as the file name.
          exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
              recommendations. This is helpful if you are calling get_tool_for_task twice
              and do not want the same tool recommended.
@@ -266,20 +271,29 @@ def get_tool_for_task(
  
      Examples
      --------
-     >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
+     >>> get_tool_for_task(
+     >>>     "Give me an OCR model that can find 'hot chocolate' in the image",
+     >>>     {"image": [image]})
+     >>> get_tool_for_taks(
+     >>>     "I need a tool that can paint a background for this image and maks",
+     >>>     {"image": [image], "mask": [mask]})
      """
      tool_tester = CONFIG.create_tool_tester()
      tool_chooser = CONFIG.create_tool_chooser()
  
+     if isinstance(images, list):
+         images = {"image": images}
+ 
      with (
          tempfile.TemporaryDirectory() as tmpdirname,
          CodeInterpreterFactory.new_instance() as code_interpreter,
      ):
          image_paths = []
-         for i, image in enumerate(images[:3]):
-             image_path = f"{tmpdirname}/image_{i}.png"
-             Image.fromarray(image).save(image_path)
-             image_paths.append(image_path)
+         for k in images.keys():
+             for i, image in enumerate(images[k]):
+                 image_path = f"{tmpdirname}/{k}_{i}.png"
+                 Image.fromarray(image).save(image_path)
+                 image_paths.append(image_path)
  
          code, tool_docs_str, tool_output = run_tool_testing(
              task, image_paths, tool_tester, exclude_tools, code_interpreter
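A usage sketch of the widened images parameter shown above (not part of the diff; the arrays are placeholders and a real call goes through the planner's tool-testing sandbox and remote models):

# Placeholder inputs; API credentials are required for the underlying tool tests.
import numpy as np
from vision_agent.tools.planner_tools import get_tool_for_task

image = np.zeros((480, 640, 3), dtype=np.uint8)
mask = np.zeros((480, 640, 3), dtype=np.uint8)

# A bare list is still accepted and is normalized to {"image": [...]}.
get_tool_for_task("Find the text 'hot chocolate' in the image", [image])

# Dict form: each key becomes the saved file name prefix (image_0.png, mask_0.png).
get_tool_for_task(
    "Inpaint a new background for this image using the mask",
    {"image": [image], "mask": [mask]},
)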
@@ -300,20 +314,26 @@ def get_tool_documentation(tool_name: str) -> str:
  
  
  def get_tool_for_task_human_reviewer(
-     task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
+     task: str,
+     images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+     exclude_tools: Optional[List[str]] = None,
  ) -> None:
      # NOTE: this will have the same documentation as get_tool_for_task
      tool_tester = CONFIG.create_tool_tester()
  
+     if isinstance(images, list):
+         images = {"image": images}
+ 
      with (
          tempfile.TemporaryDirectory() as tmpdirname,
          CodeInterpreterFactory.new_instance() as code_interpreter,
      ):
          image_paths = []
-         for i, image in enumerate(images[:3]):
-             image_path = f"{tmpdirname}/image_{i}.png"
-             Image.fromarray(image).save(image_path)
-             image_paths.append(image_path)
+         for k in images.keys():
+             for i, image in enumerate(images[k]):
+                 image_path = f"{tmpdirname}/{k}_{i}.png"
+                 Image.fromarray(image).save(image_path)
+                 image_paths.append(image_path)
  
          tools = [
              t.__name__
@@ -1727,22 +1727,46 @@ def video_temporal_localization(
      }
      payload["chunk_length_frames"] = chunk_length_frames
  
-     data = send_inference_request(
-         payload, "video-temporal-localization", files=files, v2=True
-     )
+     segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
+ 
+     def _apply_temporal_localization(
+         segment: List[np.ndarray],
+     ) -> List[float]:
+         segment_buffer_bytes = [("video", frames_to_bytes(segment))]
+         data = send_inference_request(
+             payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
+         )
+         chunked_data = [cast(float, value) for value in data]
+ 
+         full_data = []
+         for value in chunked_data:
+             full_data.extend([value] * chunk_length_frames)
+ 
+         return full_data[: len(segment)]
+ 
+     with ThreadPoolExecutor() as executor:
+         futures = {
+             executor.submit(_apply_temporal_localization, segment): segment_index
+             for segment_index, segment in enumerate(segments)
+         }
+ 
+         localization_per_segment = []
+         for future in as_completed(futures):
+             segment_index = futures[future]
+             localization_per_segment.append((segment_index, future.result()))
+ 
+     localization_per_segment = [
+         x[1] for x in sorted(localization_per_segment, key=lambda x: x[0])  # type: ignore
+     ]
+     localizations = cast(List[float], [e for o in localization_per_segment for e in o])
+ 
      _display_tool_trace(
          video_temporal_localization.__name__,
          payload,
-         data,
+         localization_per_segment,
          files,
      )
-     chunked_data = [cast(float, value) for value in data]
- 
-     full_data = []
-     for value in chunked_data:
-         full_data.extend([value] * chunk_length_frames)
- 
-     return full_data[: len(frames)]
+     return localizations
  
  
  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
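From the caller's side the contract is unchanged: one 0.0/1.0 score per input frame. A sketch, assuming extract_frames_and_timestamps is importable from vision_agent.tools and using a hypothetical local video (not part of the diff):

# Sketch only; the video path is a placeholder and the call requires API access.
from vision_agent.tools import extract_frames_and_timestamps, video_temporal_localization

frames_and_ts = extract_frames_and_timestamps("warehouse.mp4")
frames = [d["frame"] for d in frames_and_ts]

# Frames are now split into 50-frame segments and localized in parallel threads,
# then the per-segment scores are stitched back together in segment order.
scores = video_temporal_localization(
    "a forklift drives through the doorway",
    frames,
    chunk_length_frames=2,
)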
@@ -2028,16 +2052,18 @@ def flux_image_inpainting(
      mask: np.ndarray,
  ) -> np.ndarray:
      """'flux_image_inpainting' performs image inpainting to fill the masked regions,
-     given by mask, in the image, given image based on the text prompt and surrounding image context.
-     It can be used to edit regions of an image according to the prompt given.
+     given by mask, in the image, given image based on the text prompt and surrounding
+     image context. It can be used to edit regions of an image according to the prompt
+     given.
  
      Parameters:
          prompt (str): A detailed text description guiding what should be generated
-             in the masked area. More detailed and specific prompts typically yield better results.
-         image (np.ndarray): The source image to be inpainted.
-             The image will serve as the base context for the inpainting process.
-         mask (np.ndarray): A binary mask image with 0's and 1's,
-             where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+             in the masked area. More detailed and specific prompts typically yield
+             better results.
+         image (np.ndarray): The source image to be inpainted. The image will serve as
+             the base context for the inpainting process.
+         mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+             areas to be inpainted and 0 indicates areas to be preserved.
  
      Returns:
          np.ndarray: The generated image(s) as a numpy array in RGB format with values
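A minimal call sketch matching the reflowed docstring (not part of the diff; the file names are placeholders and flux_image_inpainting is assumed to be exported from vision_agent.tools):

# Placeholder files; the mask is binarized to 0/1 as the docstring requires.
import numpy as np
from PIL import Image
from vision_agent.tools import flux_image_inpainting

image = np.array(Image.open("room.png").convert("RGB"))
mask = (np.array(Image.open("room_mask.png").convert("L")) > 127).astype(np.uint8)

result = flux_image_inpainting("a potted plant on a wooden side table", image, mask)
Image.fromarray(result).save("room_inpainted.png")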