vision-agent 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,7 @@ class DefaultImports:
28
28
  code = [
29
29
  "from typing import *",
30
30
  "from vision_agent.utils.execute import CodeInterpreter",
31
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
31
+ "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
32
32
  ]
33
33
 
34
34
  @staticmethod
@@ -93,7 +93,7 @@ def format_plans(plans: Dict[str, Any]) -> str:
93
93
 
94
94
 
95
95
  def extract_image(
96
- media: Optional[Sequence[Union[str, Path]]]
96
+ media: Optional[Sequence[Union[str, Path]]],
97
97
  ) -> Optional[Sequence[Union[str, Path]]]:
98
98
  if media is None:
99
99
  return None
@@ -186,7 +186,8 @@ def pick_plan(
186
186
  if tool_output.success
187
187
  else "Code execution failed"
188
188
  ),
189
- "payload": tool_output.to_json(),
189
+ "code": DefaultImports.prepend_imports(code),
190
+ # "payload": tool_output.to_json(),
190
191
  "status": "completed" if tool_output.success else "failed",
191
192
  }
192
193
  )
@@ -211,6 +212,9 @@ def pick_plan(
211
212
  }
212
213
  )
213
214
  code = extract_code(model(prompt))
215
+ tool_output = code_interpreter.exec_isolation(
216
+ DefaultImports.prepend_imports(code)
217
+ )
214
218
  log_progress(
215
219
  {
216
220
  "type": "log",
@@ -220,13 +224,10 @@ def pick_plan(
220
224
  else "Code execution failed"
221
225
  ),
222
226
  "code": DefaultImports.prepend_imports(code),
223
- "payload": tool_output.to_json(),
227
+ # "payload": tool_output.to_json(),
224
228
  "status": "completed" if tool_output.success else "failed",
225
229
  }
226
230
  )
227
- tool_output = code_interpreter.exec_isolation(
228
- DefaultImports.prepend_imports(code)
229
- )
230
231
  tool_output_str = ""
231
232
  if len(tool_output.logs.stdout) > 0:
232
233
  tool_output_str = tool_output.logs.stdout[0]
File without changes
@@ -0,0 +1,46 @@
1
+ import json
2
+ import logging
3
+ from typing import Any, Dict, Optional
4
+
5
+ from requests import Session
6
+ from requests.adapters import HTTPAdapter
7
+ from requests.exceptions import ConnectionError, RequestException, Timeout
8
+
9
+ _LOGGER = logging.getLogger(__name__)
10
+
11
+
12
class BaseHTTP:
    """Small JSON-over-HTTP client that reuses a single ``requests`` session.

    The session carries the configured headers, and an ``HTTPAdapter`` built
    with ``max_retries=_MAX_RETRIES`` is mounted on the base endpoint prefix.
    """

    _TIMEOUT = 30  # seconds, passed as ``timeout`` on every request
    _MAX_RETRIES = 3  # passed to ``HTTPAdapter(max_retries=...)``

    def __init__(
        self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
    ) -> None:
        """Create the session for ``base_endpoint``.

        Parameters:
            base_endpoint (str): URL prefix that request paths are appended to.
            headers (Optional[Dict[str, Any]]): Headers installed on the
                session. When omitted, only a JSON ``Content-Type`` is set.
        """
        self._headers = headers
        if headers is None:
            self._headers = {
                "Content-Type": "application/json",
            }
        self._base_endpoint = base_endpoint
        self._session = Session()
        self._session.headers.update(self._headers)  # type: ignore
        self._session.mount(
            self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
        )

    def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``payload`` as JSON to ``<base_endpoint>/<url>``.

        Parameters:
            url (str): Path appended to the base endpoint after a ``/``.
            payload (Dict[str, Any]): Body sent through the ``json=`` argument.

        Returns:
            Dict[str, Any]: The decoded JSON body of the response.

        Raises:
            RequestException: The request failed (connection error, timeout,
                or a non-success status reported by ``raise_for_status``).
            json.JSONDecodeError: The response body could not be decoded.
        """
        formatted_url = f"{self._base_endpoint}/{url}"
        _LOGGER.info(f"Sending data to {formatted_url}")
        try:
            response = self._session.post(
                url=formatted_url, json=payload, timeout=self._TIMEOUT
            )
            response.raise_for_status()
        except (ConnectionError, Timeout, RequestException) as err:
            _LOGGER.warning(f"Error: {err}.")
            # Re-raise: falling through would reach ``return result`` with
            # ``result`` never assigned and mask the real failure.
            raise

        # Decoding gets its own ``try`` so this handler cannot be shadowed by
        # the request-error handler above.
        try:
            result: Dict[str, Any] = response.json()
        except json.JSONDecodeError:
            resp_text = response.text
            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
            raise

        _LOGGER.info(json.dumps(result))
        return result
@@ -0,0 +1,26 @@
1
+ import os
2
+ from uuid import UUID
3
+ from typing import List
4
+
5
+ from vision_agent.clients.http import BaseHTTP
6
+ from vision_agent.utils.type_defs import LandingaiAPIKey
7
+ from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
8
+
9
+
10
class LandingPublicAPI(BaseHTTP):
    """HTTP client for the Landing AI public API.

    The base URL is read from the ``LANDINGAI_URL`` environment variable and
    the API key from ``LANDINGAI_API_KEY``; each has a fallback value.
    """

    def __init__(self) -> None:
        """Resolve the endpoint and API key, then set up the session."""
        base_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
        # The fallback key is always constructed, even when the env var is set.
        fallback_key = LandingaiAPIKey().api_key
        api_key = os.environ.get("LANDINGAI_API_KEY", fallback_key)
        super().__init__(
            base_endpoint=base_url,
            headers={"Content-Type": "application/json", "apikey": api_key},
        )

    def launch_fine_tuning_job(
        self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
    ) -> UUID:
        """Submit a fine-tuning job and return its id.

        Parameters:
            model_name (str): Sent as ``model.name`` in the request body.
            task (PromptTask): Its ``value`` is sent as ``model.task``.
            bboxes (List[BboxInputBase64]): Each item is serialized with
                ``model_dump(by_alias=True)``.

        Returns:
            UUID: Parsed from the ``jobId`` field of the response.
        """
        model_section = {"name": model_name, "task": task.value}
        bbox_section = [item.model_dump(by_alias=True) for item in bboxes]
        request_body = {"model": model_section, "bboxes": bbox_section}
        reply = self.post("v1/agent/jobs/fine-tuning", payload=request_body)
        return UUID(reply["jobId"])
@@ -1,6 +1,6 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
- from .meta_tools import META_TOOL_DOCSTRING
3
+ from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning
4
4
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
5
5
  from .tools import (
6
6
  TOOL_DESCRIPTIONS,
@@ -1,5 +1,6 @@
1
1
  import os
2
2
  import subprocess
3
+ from uuid import UUID
3
4
  from pathlib import Path
4
5
  from typing import Any, Dict, List, Union
5
6
 
@@ -7,6 +8,9 @@ import vision_agent as va
7
8
  from vision_agent.lmm.types import Message
8
9
  from vision_agent.tools.tool_utils import get_tool_documentation
9
10
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
11
+ from vision_agent.utils.image_utils import convert_to_b64
12
+ from vision_agent.clients.landing_public_api import LandingPublicAPI
13
+ from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
10
14
 
11
15
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
12
16
 
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
385
389
  return TOOL_DESCRIPTIONS
386
390
 
387
391
 
392
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
    to detect objects in an image based on a given dataset. It returns the fine
    tuning job id.

    Parameters:
        bboxes (List[Dict[str, Any]]): A list of dictionaries, each containing
            the image path ('image_path'), the labels ('labels') and the
            bounding boxes ('bboxes').
        task (str): The name of the florencev2 fine-tuning task. The options are
            'CAPTION', 'CAPTION_TO_PHRASE_GROUNDING' and 'OBJECT_DETECTION'.

    Returns:
        UUID: The fine tuning job id, this id will be used to retrieve the fine
            tuned model.

    Example
    -------
        >>> fine_tuning_job_id = florencev2_fine_tuning(
            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "OBJECT_DETECTION"
        )
    """
    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
    # Lookup is by enum member name; an unknown name raises KeyError.
    task_input = PromptTask[task]
    fine_tuning_request = [
        BboxInputBase64(
            image=convert_to_b64(bbox_input.image_path),
            # Path.name honours the platform's path separators, whereas
            # splitting on "/" only works for forward-slash paths.
            filename=Path(bbox_input.image_path).name,
            labels=bbox_input.labels,
            bboxes=bbox_input.bboxes,
        )
        for bbox_input in bboxes_input
    ]
    landing_api = LandingPublicAPI()
    return landing_api.launch_fine_tuning_job(
        "florencev2", task_input, fine_tuning_request
    )
430
+
431
+
388
432
  META_TOOL_DOCSTRING = get_tool_documentation(
389
433
  [
390
434
  get_tool_descriptions,
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
398
442
  search_dir,
399
443
  search_file,
400
444
  find_file,
445
+ florencev2_fine_tuning,
401
446
  ]
402
447
  )
@@ -0,0 +1,30 @@
1
+ from enum import Enum
2
+ from typing import List, Tuple
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
class BboxInput(BaseModel):
    """Annotated image referenced by file path."""

    # Path of the image file.
    image_path: str
    # Label strings for the image.
    # NOTE(review): presumably one label per entry in ``bboxes`` — confirm.
    labels: List[str]
    # Boxes as 4-integer tuples.
    # NOTE(review): coordinate order is not stated here — confirm with the API.
    bboxes: List[Tuple[int, int, int, int]]
11
+
12
+
13
class BboxInputBase64(BaseModel):
    """Annotated image carried inline as a string, plus its file name."""

    # Image content as a string.
    # NOTE(review): presumably base64-encoded, going by the class name — confirm.
    image: str
    # Name of the image file.
    filename: str
    # Label strings for the image.
    labels: List[str]
    # Boxes as 4-integer tuples.
    # NOTE(review): coordinate order is not stated here — confirm with the API.
    bboxes: List[Tuple[int, int, int, int]]
18
+
19
+
20
class PromptTask(str, Enum):
    """
    Valid task prompt options for the Florencev2 model.
    """

    # Members also subclass ``str``, so each one compares equal to its value.
    # The values are the prompt strings; the member names are the task names.
    CAPTION = "<CAPTION>"
    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
    OBJECT_DETECTION = "<OD>"
@@ -2,23 +2,23 @@ import io
2
2
  import json
3
3
  import logging
4
4
  import tempfile
5
- from importlib import resources
6
5
  from pathlib import Path
6
+ from importlib import resources
7
7
  from typing import Any, Dict, List, Optional, Tuple, Union, cast
8
8
 
9
9
  import cv2
10
- import numpy as np
11
10
  import requests
11
+ import numpy as np
12
+ from pytube import YouTube # type: ignore
12
13
  from moviepy.editor import ImageSequenceClip
13
14
  from PIL import Image, ImageDraw, ImageFont
14
15
  from pillow_heif import register_heif_opener # type: ignore
15
- from pytube import YouTube # type: ignore
16
16
 
17
17
  from vision_agent.tools.tool_utils import (
18
+ send_inference_request,
18
19
  get_tool_descriptions,
19
20
  get_tool_documentation,
20
21
  get_tools_df,
21
- send_inference_request,
22
22
  )
23
23
  from vision_agent.utils import extract_frames_from_video
24
24
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -1063,7 +1063,6 @@ def save_video(
1063
1063
  if fps <= 0:
1064
1064
  _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
1065
1065
  fps = 4
1066
-
1067
1066
  with ImageSequenceClip(frames, fps=fps) as video:
1068
1067
  if output_video_path:
1069
1068
  f = open(output_video_path, "wb")
@@ -209,7 +209,7 @@ class Result:
209
209
  return formats
210
210
 
211
211
  @staticmethod
212
- def from_e2b_result(result: E2BResult) -> "Result": # type: ignore
212
+ def from_e2b_result(result: E2BResult) -> "Result":
213
213
  """
214
214
  Creates a Result object from an E2BResult object.
215
215
  """
@@ -361,7 +361,7 @@ class Execution(BaseModel):
361
361
  )
362
362
 
363
363
  @staticmethod
364
- def from_e2b_execution(exec: E2BExecution) -> "Execution": # type: ignore
364
+ def from_e2b_execution(exec: E2BExecution) -> "Execution":
365
365
  """Creates an Execution object from an E2BResult object."""
366
366
  return Execution(
367
367
  results=[Result.from_e2b_result(res) for res in exec.results],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.97
3
+ Version: 0.2.99
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -2,28 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9Ks,135
3
3
  vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
4
4
  vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
5
- vision_agent/agent/vision_agent.py,sha256=i_rNpc7faqHTifp2c9sQE4Js3qYUKuJeiqauTp90OlE,8417
6
- vision_agent/agent/vision_agent_coder.py,sha256=M8J5xE9uX8Nig1WmVmwLMeCSe0E6Bg3Mo5fPPcW_a-c,30246
5
+ vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
6
+ vision_agent/agent/vision_agent_coder.py,sha256=N8oVwfxrz6emHlucJC5hGQvkA9cQWW2sMLFtshwLdI8,30309
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
9
+ vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ vision_agent/clients/http.py,sha256=1WMt29F12YFfPH03AttKxnUNXx5sNOD9ZuH4etbB054,1598
11
+ vision_agent/clients/landing_public_api.py,sha256=Tjl8uBZWc3dvrCOKg-PCYjw3RC3X5Y6B50kaKn_QzL0,1050
9
12
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
13
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
11
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
12
15
  vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
13
16
  vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
14
- vision_agent/tools/__init__.py,sha256=UNiaJAOt1C709gaJ-a9h9BzKnY5JmoEUpgKftsOnyPQ,1882
15
- vision_agent/tools/meta_tools.py,sha256=rmxgVzj-vJKeewHbue3qHru4sYsFLxlSZV-YH-eyH5w,13366
17
+ vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
18
+ vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
19
+ vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
16
20
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
17
21
  vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
18
- vision_agent/tools/tools.py,sha256=fHD4qhn7cGG1O77J_BHfaRfW6LMQuj1OIu9xqYu6AG8,43220
22
+ vision_agent/tools/tools.py,sha256=aYo0xSbdr-Q4gq_dKxa8yLyczmXoKv_vYYrZ7dM38bw,43219
19
23
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
20
24
  vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
21
- vision_agent/utils/execute.py,sha256=s43aUtuq7ZNjil2mxrddiz8EvvqlJwttkYlIiZouXqM,25125
25
+ vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
22
26
  vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
23
27
  vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
24
28
  vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
25
29
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
26
- vision_agent-0.2.97.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
- vision_agent-0.2.97.dist-info/METADATA,sha256=00md0PT29fBJuyXl2LeWcrC3l5T6FXn85YE6Kmat60Q,10728
28
- vision_agent-0.2.97.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
- vision_agent-0.2.97.dist-info/RECORD,,
30
+ vision_agent-0.2.99.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.99.dist-info/METADATA,sha256=QDiN7-jSVTpGtrwJLhvSUM1A7aj1baWhZ9eFf1GVn2E,10728
32
+ vision_agent-0.2.99.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.99.dist-info/RECORD,,