vision-agent 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +7 -6
- vision_agent/clients/__init__.py +0 -0
- vision_agent/clients/http.py +46 -0
- vision_agent/clients/landing_public_api.py +26 -0
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +45 -0
- vision_agent/tools/meta_tools_types.py +30 -0
- vision_agent/tools/tools.py +4 -5
- vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/METADATA +1 -1
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/RECORD +14 -10
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/WHEEL +0 -0
@@ -28,7 +28,7 @@ class DefaultImports:
|
|
28
28
|
code = [
|
29
29
|
"from typing import *",
|
30
30
|
"from vision_agent.utils.execute import CodeInterpreter",
|
31
|
-
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
|
31
|
+
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
|
32
32
|
]
|
33
33
|
|
34
34
|
@staticmethod
|
@@ -93,7 +93,7 @@ def format_plans(plans: Dict[str, Any]) -> str:
|
|
93
93
|
|
94
94
|
|
95
95
|
def extract_image(
|
96
|
-
media: Optional[Sequence[Union[str, Path]]]
|
96
|
+
media: Optional[Sequence[Union[str, Path]]],
|
97
97
|
) -> Optional[Sequence[Union[str, Path]]]:
|
98
98
|
if media is None:
|
99
99
|
return None
|
@@ -186,7 +186,8 @@ def pick_plan(
|
|
186
186
|
if tool_output.success
|
187
187
|
else "Code execution failed"
|
188
188
|
),
|
189
|
-
"
|
189
|
+
"code": DefaultImports.prepend_imports(code),
|
190
|
+
# "payload": tool_output.to_json(),
|
190
191
|
"status": "completed" if tool_output.success else "failed",
|
191
192
|
}
|
192
193
|
)
|
@@ -211,6 +212,9 @@ def pick_plan(
|
|
211
212
|
}
|
212
213
|
)
|
213
214
|
code = extract_code(model(prompt))
|
215
|
+
tool_output = code_interpreter.exec_isolation(
|
216
|
+
DefaultImports.prepend_imports(code)
|
217
|
+
)
|
214
218
|
log_progress(
|
215
219
|
{
|
216
220
|
"type": "log",
|
@@ -220,13 +224,10 @@ def pick_plan(
|
|
220
224
|
else "Code execution failed"
|
221
225
|
),
|
222
226
|
"code": DefaultImports.prepend_imports(code),
|
223
|
-
"payload": tool_output.to_json(),
|
227
|
+
# "payload": tool_output.to_json(),
|
224
228
|
"status": "completed" if tool_output.success else "failed",
|
225
229
|
}
|
226
230
|
)
|
227
|
-
tool_output = code_interpreter.exec_isolation(
|
228
|
-
DefaultImports.prepend_imports(code)
|
229
|
-
)
|
230
231
|
tool_output_str = ""
|
231
232
|
if len(tool_output.logs.stdout) > 0:
|
232
233
|
tool_output_str = tool_output.logs.stdout[0]
|
File without changes
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from typing import Any, Dict, Optional
|
4
|
+
|
5
|
+
from requests import Session
|
6
|
+
from requests.adapters import HTTPAdapter
|
7
|
+
from requests.exceptions import ConnectionError, RequestException, Timeout
|
8
|
+
|
9
|
+
_LOGGER = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class BaseHTTP:
|
13
|
+
_TIMEOUT = 30 # seconds
|
14
|
+
_MAX_RETRIES = 3
|
15
|
+
|
16
|
+
def __init__(
|
17
|
+
self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
|
18
|
+
) -> None:
|
19
|
+
self._headers = headers
|
20
|
+
if headers is None:
|
21
|
+
self._headers = {
|
22
|
+
"Content-Type": "application/json",
|
23
|
+
}
|
24
|
+
self._base_endpoint = base_endpoint
|
25
|
+
self._session = Session()
|
26
|
+
self._session.headers.update(self._headers) # type: ignore
|
27
|
+
self._session.mount(
|
28
|
+
self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
|
29
|
+
)
|
30
|
+
|
31
|
+
def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
32
|
+
formatted_url = f"{self._base_endpoint}/{url}"
|
33
|
+
_LOGGER.info(f"Sending data to {formatted_url}")
|
34
|
+
try:
|
35
|
+
response = self._session.post(
|
36
|
+
url=formatted_url, json=payload, timeout=self._TIMEOUT
|
37
|
+
)
|
38
|
+
response.raise_for_status()
|
39
|
+
result: Dict[str, Any] = response.json()
|
40
|
+
_LOGGER.info(json.dumps(result))
|
41
|
+
except (ConnectionError, Timeout, RequestException) as err:
|
42
|
+
_LOGGER.warning(f"Error: {err}.")
|
43
|
+
except json.JSONDecodeError:
|
44
|
+
resp_text = response.text
|
45
|
+
_LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
|
46
|
+
return result
|
@@ -0,0 +1,26 @@
|
|
1
|
+
import os
|
2
|
+
from uuid import UUID
|
3
|
+
from typing import List
|
4
|
+
|
5
|
+
from vision_agent.clients.http import BaseHTTP
|
6
|
+
from vision_agent.utils.type_defs import LandingaiAPIKey
|
7
|
+
from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
|
8
|
+
|
9
|
+
|
10
|
+
class LandingPublicAPI(BaseHTTP):
|
11
|
+
def __init__(self) -> None:
|
12
|
+
landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
|
13
|
+
landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
|
14
|
+
headers = {"Content-Type": "application/json", "apikey": landing_api_key}
|
15
|
+
super().__init__(base_endpoint=landing_url, headers=headers)
|
16
|
+
|
17
|
+
def launch_fine_tuning_job(
|
18
|
+
self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
|
19
|
+
) -> UUID:
|
20
|
+
url = "v1/agent/jobs/fine-tuning"
|
21
|
+
data = {
|
22
|
+
"model": {"name": model_name, "task": task.value},
|
23
|
+
"bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
|
24
|
+
}
|
25
|
+
response = self.post(url, payload=data)
|
26
|
+
return UUID(response["jobId"])
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import os
|
2
2
|
import subprocess
|
3
|
+
from uuid import UUID
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, Dict, List, Union
|
5
6
|
|
@@ -7,6 +8,9 @@ import vision_agent as va
|
|
7
8
|
from vision_agent.lmm.types import Message
|
8
9
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
9
10
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
11
|
+
from vision_agent.utils.image_utils import convert_to_b64
|
12
|
+
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
13
|
+
from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
|
10
14
|
|
11
15
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
12
16
|
|
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
|
|
385
389
|
return TOOL_DESCRIPTIONS
|
386
390
|
|
387
391
|
|
392
|
+
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
|
393
|
+
"""'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
|
394
|
+
to detect objects in an image based on a given dataset. It returns the fine
|
395
|
+
tuning job id.
|
396
|
+
|
397
|
+
Parameters:
|
398
|
+
bboxes (List[BboxInput]): A list of BboxInput containing the
|
399
|
+
image path, labels and bounding boxes.
|
400
|
+
task (PromptTask): The florencev2 fine-tuning task. The options are
|
401
|
+
CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
|
402
|
+
|
403
|
+
Returns:
|
404
|
+
UUID: The fine tuning job id, this id will used to retrieve the fine
|
405
|
+
tuned model.
|
406
|
+
|
407
|
+
Example
|
408
|
+
-------
|
409
|
+
>>> fine_tuning_job_id = florencev2_fine_tuning(
|
410
|
+
[{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
|
411
|
+
{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
|
412
|
+
"OBJECT_DETECTION"
|
413
|
+
)
|
414
|
+
"""
|
415
|
+
bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
|
416
|
+
task_input = PromptTask[task]
|
417
|
+
fine_tuning_request = [
|
418
|
+
BboxInputBase64(
|
419
|
+
image=convert_to_b64(bbox_input.image_path),
|
420
|
+
filename=bbox_input.image_path.split("/")[-1],
|
421
|
+
labels=bbox_input.labels,
|
422
|
+
bboxes=bbox_input.bboxes,
|
423
|
+
)
|
424
|
+
for bbox_input in bboxes_input
|
425
|
+
]
|
426
|
+
landing_api = LandingPublicAPI()
|
427
|
+
return landing_api.launch_fine_tuning_job(
|
428
|
+
"florencev2", task_input, fine_tuning_request
|
429
|
+
)
|
430
|
+
|
431
|
+
|
388
432
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
389
433
|
[
|
390
434
|
get_tool_descriptions,
|
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
398
442
|
search_dir,
|
399
443
|
search_file,
|
400
444
|
find_file,
|
445
|
+
florencev2_fine_tuning,
|
401
446
|
]
|
402
447
|
)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import List, Tuple
|
3
|
+
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
|
7
|
+
class BboxInput(BaseModel):
|
8
|
+
image_path: str
|
9
|
+
labels: List[str]
|
10
|
+
bboxes: List[Tuple[int, int, int, int]]
|
11
|
+
|
12
|
+
|
13
|
+
class BboxInputBase64(BaseModel):
|
14
|
+
image: str
|
15
|
+
filename: str
|
16
|
+
labels: List[str]
|
17
|
+
bboxes: List[Tuple[int, int, int, int]]
|
18
|
+
|
19
|
+
|
20
|
+
class PromptTask(str, Enum):
|
21
|
+
"""
|
22
|
+
Valid task prompts options for the Florencev2 model.
|
23
|
+
"""
|
24
|
+
|
25
|
+
CAPTION = "<CAPTION>"
|
26
|
+
""""""
|
27
|
+
CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
|
28
|
+
""""""
|
29
|
+
OBJECT_DETECTION = "<OD>"
|
30
|
+
""""""
|
vision_agent/tools/tools.py
CHANGED
@@ -2,23 +2,23 @@ import io
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import tempfile
|
5
|
-
from importlib import resources
|
6
5
|
from pathlib import Path
|
6
|
+
from importlib import resources
|
7
7
|
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
8
8
|
|
9
9
|
import cv2
|
10
|
-
import numpy as np
|
11
10
|
import requests
|
11
|
+
import numpy as np
|
12
|
+
from pytube import YouTube # type: ignore
|
12
13
|
from moviepy.editor import ImageSequenceClip
|
13
14
|
from PIL import Image, ImageDraw, ImageFont
|
14
15
|
from pillow_heif import register_heif_opener # type: ignore
|
15
|
-
from pytube import YouTube # type: ignore
|
16
16
|
|
17
17
|
from vision_agent.tools.tool_utils import (
|
18
|
+
send_inference_request,
|
18
19
|
get_tool_descriptions,
|
19
20
|
get_tool_documentation,
|
20
21
|
get_tools_df,
|
21
|
-
send_inference_request,
|
22
22
|
)
|
23
23
|
from vision_agent.utils import extract_frames_from_video
|
24
24
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
@@ -1063,7 +1063,6 @@ def save_video(
|
|
1063
1063
|
if fps <= 0:
|
1064
1064
|
_LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
|
1065
1065
|
fps = 4
|
1066
|
-
|
1067
1066
|
with ImageSequenceClip(frames, fps=fps) as video:
|
1068
1067
|
if output_video_path:
|
1069
1068
|
f = open(output_video_path, "wb")
|
vision_agent/utils/execute.py
CHANGED
@@ -209,7 +209,7 @@ class Result:
|
|
209
209
|
return formats
|
210
210
|
|
211
211
|
@staticmethod
|
212
|
-
def from_e2b_result(result: E2BResult) -> "Result":
|
212
|
+
def from_e2b_result(result: E2BResult) -> "Result":
|
213
213
|
"""
|
214
214
|
Creates a Result object from an E2BResult object.
|
215
215
|
"""
|
@@ -361,7 +361,7 @@ class Execution(BaseModel):
|
|
361
361
|
)
|
362
362
|
|
363
363
|
@staticmethod
|
364
|
-
def from_e2b_execution(exec: E2BExecution) -> "Execution":
|
364
|
+
def from_e2b_execution(exec: E2BExecution) -> "Execution":
|
365
365
|
"""Creates an Execution object from an E2BResult object."""
|
366
366
|
return Execution(
|
367
367
|
results=[Result.from_e2b_result(res) for res in exec.results],
|
@@ -2,28 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9Ks,135
|
3
3
|
vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=N8oVwfxrz6emHlucJC5hGQvkA9cQWW2sMLFtshwLdI8,30309
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
|
9
|
+
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
vision_agent/clients/http.py,sha256=1WMt29F12YFfPH03AttKxnUNXx5sNOD9ZuH4etbB054,1598
|
11
|
+
vision_agent/clients/landing_public_api.py,sha256=Tjl8uBZWc3dvrCOKg-PCYjw3RC3X5Y6B50kaKn_QzL0,1050
|
9
12
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
13
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
11
14
|
vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
|
12
15
|
vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
|
13
16
|
vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
|
14
|
-
vision_agent/tools/__init__.py,sha256=
|
15
|
-
vision_agent/tools/meta_tools.py,sha256=
|
17
|
+
vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
|
18
|
+
vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
|
19
|
+
vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
|
16
20
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
17
21
|
vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
|
18
|
-
vision_agent/tools/tools.py,sha256=
|
22
|
+
vision_agent/tools/tools.py,sha256=aYo0xSbdr-Q4gq_dKxa8yLyczmXoKv_vYYrZ7dM38bw,43219
|
19
23
|
vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
|
20
24
|
vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
|
21
|
-
vision_agent/utils/execute.py,sha256=
|
25
|
+
vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
|
22
26
|
vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
|
23
27
|
vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
|
24
28
|
vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
|
25
29
|
vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
|
26
|
-
vision_agent-0.2.
|
27
|
-
vision_agent-0.2.
|
28
|
-
vision_agent-0.2.
|
29
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.99.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.99.dist-info/METADATA,sha256=QDiN7-jSVTpGtrwJLhvSUM1A7aj1baWhZ9eFf1GVn2E,10728
|
32
|
+
vision_agent-0.2.99.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.99.dist-info/RECORD,,
|
File without changes
|
File without changes
|