vision-agent 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -28,7 +28,7 @@ class DefaultImports:
28
28
  code = [
29
29
  "from typing import *",
30
30
  "from vision_agent.utils.execute import CodeInterpreter",
31
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
31
+ "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
32
32
  ]
33
33
 
34
34
  @staticmethod
@@ -93,7 +93,7 @@ def format_plans(plans: Dict[str, Any]) -> str:
93
93
 
94
94
 
95
95
  def extract_image(
96
- media: Optional[Sequence[Union[str, Path]]]
96
+ media: Optional[Sequence[Union[str, Path]]],
97
97
  ) -> Optional[Sequence[Union[str, Path]]]:
98
98
  if media is None:
99
99
  return None
@@ -186,7 +186,8 @@ def pick_plan(
186
186
  if tool_output.success
187
187
  else "Code execution failed"
188
188
  ),
189
- "payload": tool_output.to_json(),
189
+ "code": DefaultImports.prepend_imports(code),
190
+ # "payload": tool_output.to_json(),
190
191
  "status": "completed" if tool_output.success else "failed",
191
192
  }
192
193
  )
@@ -211,6 +212,9 @@ def pick_plan(
211
212
  }
212
213
  )
213
214
  code = extract_code(model(prompt))
215
+ tool_output = code_interpreter.exec_isolation(
216
+ DefaultImports.prepend_imports(code)
217
+ )
214
218
  log_progress(
215
219
  {
216
220
  "type": "log",
@@ -220,13 +224,10 @@ def pick_plan(
220
224
  else "Code execution failed"
221
225
  ),
222
226
  "code": DefaultImports.prepend_imports(code),
223
- "payload": tool_output.to_json(),
227
+ # "payload": tool_output.to_json(),
224
228
  "status": "completed" if tool_output.success else "failed",
225
229
  }
226
230
  )
227
- tool_output = code_interpreter.exec_isolation(
228
- DefaultImports.prepend_imports(code)
229
- )
230
231
  tool_output_str = ""
231
232
  if len(tool_output.logs.stdout) > 0:
232
233
  tool_output_str = tool_output.logs.stdout[0]
File without changes
@@ -0,0 +1,46 @@
1
+ import json
2
+ import logging
3
+ from typing import Any, Dict, Optional
4
+
5
+ from requests import Session
6
+ from requests.adapters import HTTPAdapter
7
+ from requests.exceptions import ConnectionError, RequestException, Timeout
8
+
9
+ _LOGGER = logging.getLogger(__name__)
10
+
11
+
12
class BaseHTTP:
    """Small JSON-over-HTTP client built around one shared ``requests`` session.

    The session carries the default headers and has an ``HTTPAdapter``
    configured with ``_MAX_RETRIES`` mounted on the base endpoint prefix.
    """

    _TIMEOUT = 30  # seconds
    _MAX_RETRIES = 3

    def __init__(
        self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
    ) -> None:
        """Create the session used for every request.

        Parameters:
            base_endpoint (str): URL prefix that request paths are appended to
                and that the retrying adapter is mounted on.
            headers (Optional[Dict[str, Any]]): Headers applied to the session.
                When None, only a JSON ``Content-Type`` header is set.
        """
        self._headers = headers
        if headers is None:
            self._headers = {
                "Content-Type": "application/json",
            }
        self._base_endpoint = base_endpoint
        self._session = Session()
        self._session.headers.update(self._headers)  # type: ignore
        self._session.mount(
            self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
        )

    def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``payload`` as JSON to ``<base_endpoint>/<url>`` and return the
        decoded JSON response.

        Parameters:
            url (str): Path appended to the base endpoint.
            payload (Dict[str, Any]): JSON-serializable request body.

        Returns:
            Dict[str, Any]: The decoded JSON body of the response.

        Raises:
            RequestException: If the request fails or the server answers with
                an error status. The error is logged, then re-raised.
            ValueError: If the response body is not valid JSON. The raw text is
                logged, then the decode error is re-raised.
        """
        formatted_url = f"{self._base_endpoint}/{url}"
        _LOGGER.info(f"Sending data to {formatted_url}")
        try:
            response = self._session.post(
                url=formatted_url, json=payload, timeout=self._TIMEOUT
            )
            response.raise_for_status()
        except (ConnectionError, Timeout, RequestException) as err:
            _LOGGER.warning(f"Error: {err}.")
            # Re-raise: there is no result to return, and falling through would
            # only replace this error with an UnboundLocalError.
            raise
        try:
            result: Dict[str, Any] = response.json()
        except ValueError:
            # Every JSON decode error raised by ``response.json()`` is a
            # ValueError subclass, whichever JSON backend requests uses.
            resp_text = response.text
            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
            raise
        _LOGGER.info(json.dumps(result))
        return result
@@ -0,0 +1,26 @@
1
+ import os
2
+ from uuid import UUID
3
+ from typing import List
4
+
5
+ from vision_agent.clients.http import BaseHTTP
6
+ from vision_agent.utils.type_defs import LandingaiAPIKey
7
+ from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
8
+
9
+
10
class LandingPublicAPI(BaseHTTP):
    """Client for the LandingAI public API, configured from the environment."""

    def __init__(self) -> None:
        """Read the endpoint and API key, then initialise the base HTTP client.

        The endpoint is taken from ``LANDINGAI_URL`` and the key from
        ``LANDINGAI_API_KEY``; each falls back to the default shown below.
        """
        base_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
        # The fallback key object is built on every call, whether or not the
        # environment variable is set.
        fallback_key = LandingaiAPIKey().api_key
        api_key = os.environ.get("LANDINGAI_API_KEY", fallback_key)
        super().__init__(
            base_endpoint=base_url,
            headers={"Content-Type": "application/json", "apikey": api_key},
        )

    def launch_fine_tuning_job(
        self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
    ) -> UUID:
        """Submit a fine-tuning job and return its id.

        Parameters:
            model_name (str): Name of the model to fine-tune.
            task (PromptTask): Task whose ``value`` is sent with the model name.
            bboxes (List[BboxInputBase64]): Annotated images, each serialized
                with ``model_dump(by_alias=True)``.

        Returns:
            UUID: The ``jobId`` field of the response, parsed as a UUID.
        """
        model_spec = {"name": model_name, "task": task.value}
        serialized_boxes = [box.model_dump(by_alias=True) for box in bboxes]
        request_body = {"model": model_spec, "bboxes": serialized_boxes}
        job = self.post("v1/agent/jobs/fine-tuning", payload=request_body)
        return UUID(job["jobId"])
@@ -1,6 +1,6 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
- from .meta_tools import META_TOOL_DOCSTRING
3
+ from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning
4
4
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
5
5
  from .tools import (
6
6
  TOOL_DESCRIPTIONS,
@@ -1,5 +1,6 @@
1
1
  import os
2
2
  import subprocess
3
+ from uuid import UUID
3
4
  from pathlib import Path
4
5
  from typing import Any, Dict, List, Union
5
6
 
@@ -7,6 +8,9 @@ import vision_agent as va
7
8
  from vision_agent.lmm.types import Message
8
9
  from vision_agent.tools.tool_utils import get_tool_documentation
9
10
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
11
+ from vision_agent.utils.image_utils import convert_to_b64
12
+ from vision_agent.clients.landing_public_api import LandingPublicAPI
13
+ from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
10
14
 
11
15
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
12
16
 
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
385
389
  return TOOL_DESCRIPTIONS
386
390
 
387
391
 
392
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
    to detect objects in an image based on a given dataset. It returns the fine
    tuning job id.

    Parameters:
        bboxes (List[Dict[str, Any]]): A list of dictionaries, each containing
            the image path ('image_path'), the labels ('labels') and the
            bounding boxes ('bboxes') for one image.
        task (str): The name of the florencev2 fine-tuning task. The options
            are "CAPTION", "CAPTION_TO_PHRASE_GROUNDING" and "OBJECT_DETECTION".

    Returns:
        UUID: The fine tuning job id, this id will be used to retrieve the fine
            tuned model.

    Example
    -------
        >>> fine_tuning_job_id = florencev2_fine_tuning(
            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "OBJECT_DETECTION"
        )
    """
    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
    # Lookup is by member name; an unknown name raises KeyError.
    task_input = PromptTask[task]
    fine_tuning_request = [
        BboxInputBase64(
            image=convert_to_b64(bbox_input.image_path),
            # Path.name handles the platform's own separators, unlike
            # splitting on "/" which only works for POSIX-style paths.
            filename=Path(bbox_input.image_path).name,
            labels=bbox_input.labels,
            bboxes=bbox_input.bboxes,
        )
        for bbox_input in bboxes_input
    ]
    landing_api = LandingPublicAPI()
    return landing_api.launch_fine_tuning_job(
        "florencev2", task_input, fine_tuning_request
    )
430
+
431
+
388
432
  META_TOOL_DOCSTRING = get_tool_documentation(
389
433
  [
390
434
  get_tool_descriptions,
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
398
442
  search_dir,
399
443
  search_file,
400
444
  find_file,
445
+ florencev2_fine_tuning,
401
446
  ]
402
447
  )
@@ -0,0 +1,30 @@
1
+ from enum import Enum
2
+ from typing import List, Tuple
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
class BboxInput(BaseModel):
    # Bounding-box annotations for one image that is referenced by file path.

    # Path of the image file.
    image_path: str
    # Labels for the annotated objects.
    # NOTE(review): presumably labels[i] belongs to bboxes[i]; this model does
    # not enforce matching lengths - confirm with the caller.
    labels: List[str]
    # One 4-integer tuple per box.
    # NOTE(review): coordinate order is not defined here; presumably
    # (x_min, y_min, x_max, y_max) - confirm.
    bboxes: List[Tuple[int, int, int, int]]
11
+
12
+
13
class BboxInputBase64(BaseModel):
    # Bounding-box annotations for one image whose content travels inline as a
    # string rather than as a path.

    # Image content as a string.
    # NOTE(review): presumably base64-encoded, going by the class name - confirm.
    image: str
    # File name associated with the image.
    filename: str
    # Labels for the annotated objects.
    labels: List[str]
    # One 4-integer tuple per box.
    # NOTE(review): coordinate order is not defined here - confirm.
    bboxes: List[Tuple[int, int, int, int]]
18
+
19
+
20
class PromptTask(str, Enum):
    """
    Valid task prompt options for the Florencev2 model.
    """

    # Members are selected by name (e.g. PromptTask["OBJECT_DETECTION"]);
    # each value is the corresponding prompt token string.
    CAPTION = "<CAPTION>"
    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
    OBJECT_DETECTION = "<OD>"
@@ -2,23 +2,23 @@ import io
2
2
  import json
3
3
  import logging
4
4
  import tempfile
5
- from importlib import resources
6
5
  from pathlib import Path
6
+ from importlib import resources
7
7
  from typing import Any, Dict, List, Optional, Tuple, Union, cast
8
8
 
9
9
  import cv2
10
- import numpy as np
11
10
  import requests
11
+ import numpy as np
12
+ from pytube import YouTube # type: ignore
12
13
  from moviepy.editor import ImageSequenceClip
13
14
  from PIL import Image, ImageDraw, ImageFont
14
15
  from pillow_heif import register_heif_opener # type: ignore
15
- from pytube import YouTube # type: ignore
16
16
 
17
17
  from vision_agent.tools.tool_utils import (
18
+ send_inference_request,
18
19
  get_tool_descriptions,
19
20
  get_tool_documentation,
20
21
  get_tools_df,
21
- send_inference_request,
22
22
  )
23
23
  from vision_agent.utils import extract_frames_from_video
24
24
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -1063,7 +1063,6 @@ def save_video(
1063
1063
  if fps <= 0:
1064
1064
  _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
1065
1065
  fps = 4
1066
-
1067
1066
  with ImageSequenceClip(frames, fps=fps) as video:
1068
1067
  if output_video_path:
1069
1068
  f = open(output_video_path, "wb")
@@ -209,7 +209,7 @@ class Result:
209
209
  return formats
210
210
 
211
211
  @staticmethod
212
- def from_e2b_result(result: E2BResult) -> "Result": # type: ignore
212
+ def from_e2b_result(result: E2BResult) -> "Result":
213
213
  """
214
214
  Creates a Result object from an E2BResult object.
215
215
  """
@@ -361,7 +361,7 @@ class Execution(BaseModel):
361
361
  )
362
362
 
363
363
  @staticmethod
364
- def from_e2b_execution(exec: E2BExecution) -> "Execution": # type: ignore
364
+ def from_e2b_execution(exec: E2BExecution) -> "Execution":
365
365
  """Creates an Execution object from an E2BResult object."""
366
366
  return Execution(
367
367
  results=[Result.from_e2b_result(res) for res in exec.results],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.97
3
+ Version: 0.2.99
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -2,28 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9Ks,135
3
3
  vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
4
4
  vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
5
- vision_agent/agent/vision_agent.py,sha256=i_rNpc7faqHTifp2c9sQE4Js3qYUKuJeiqauTp90OlE,8417
6
- vision_agent/agent/vision_agent_coder.py,sha256=M8J5xE9uX8Nig1WmVmwLMeCSe0E6Bg3Mo5fPPcW_a-c,30246
5
+ vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
6
+ vision_agent/agent/vision_agent_coder.py,sha256=N8oVwfxrz6emHlucJC5hGQvkA9cQWW2sMLFtshwLdI8,30309
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
9
+ vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ vision_agent/clients/http.py,sha256=1WMt29F12YFfPH03AttKxnUNXx5sNOD9ZuH4etbB054,1598
11
+ vision_agent/clients/landing_public_api.py,sha256=Tjl8uBZWc3dvrCOKg-PCYjw3RC3X5Y6B50kaKn_QzL0,1050
9
12
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
13
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
11
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
12
15
  vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
13
16
  vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
14
- vision_agent/tools/__init__.py,sha256=UNiaJAOt1C709gaJ-a9h9BzKnY5JmoEUpgKftsOnyPQ,1882
15
- vision_agent/tools/meta_tools.py,sha256=rmxgVzj-vJKeewHbue3qHru4sYsFLxlSZV-YH-eyH5w,13366
17
+ vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
18
+ vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
19
+ vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
16
20
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
17
21
  vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
18
- vision_agent/tools/tools.py,sha256=fHD4qhn7cGG1O77J_BHfaRfW6LMQuj1OIu9xqYu6AG8,43220
22
+ vision_agent/tools/tools.py,sha256=aYo0xSbdr-Q4gq_dKxa8yLyczmXoKv_vYYrZ7dM38bw,43219
19
23
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
20
24
  vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
21
- vision_agent/utils/execute.py,sha256=s43aUtuq7ZNjil2mxrddiz8EvvqlJwttkYlIiZouXqM,25125
25
+ vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
22
26
  vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
23
27
  vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
24
28
  vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
25
29
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
26
- vision_agent-0.2.97.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
- vision_agent-0.2.97.dist-info/METADATA,sha256=00md0PT29fBJuyXl2LeWcrC3l5T6FXn85YE6Kmat60Q,10728
28
- vision_agent-0.2.97.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
- vision_agent-0.2.97.dist-info/RECORD,,
30
+ vision_agent-0.2.99.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.99.dist-info/METADATA,sha256=QDiN7-jSVTpGtrwJLhvSUM1A7aj1baWhZ9eFf1GVn2E,10728
32
+ vision_agent-0.2.99.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.99.dist-info/RECORD,,