vision-agent 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +7 -6
- vision_agent/clients/__init__.py +0 -0
- vision_agent/clients/http.py +46 -0
- vision_agent/clients/landing_public_api.py +26 -0
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +45 -0
- vision_agent/tools/meta_tools_types.py +30 -0
- vision_agent/tools/tools.py +4 -5
- vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/METADATA +1 -1
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/RECORD +14 -10
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.97.dist-info → vision_agent-0.2.99.dist-info}/WHEEL +0 -0
@@ -28,7 +28,7 @@ class DefaultImports:
|
|
28
28
|
code = [
|
29
29
|
"from typing import *",
|
30
30
|
"from vision_agent.utils.execute import CodeInterpreter",
|
31
|
-
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
|
31
|
+
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
|
32
32
|
]
|
33
33
|
|
34
34
|
@staticmethod
|
@@ -93,7 +93,7 @@ def format_plans(plans: Dict[str, Any]) -> str:
|
|
93
93
|
|
94
94
|
|
95
95
|
def extract_image(
|
96
|
-
media: Optional[Sequence[Union[str, Path]]]
|
96
|
+
media: Optional[Sequence[Union[str, Path]]],
|
97
97
|
) -> Optional[Sequence[Union[str, Path]]]:
|
98
98
|
if media is None:
|
99
99
|
return None
|
@@ -186,7 +186,8 @@ def pick_plan(
|
|
186
186
|
if tool_output.success
|
187
187
|
else "Code execution failed"
|
188
188
|
),
|
189
|
-
"
|
189
|
+
"code": DefaultImports.prepend_imports(code),
|
190
|
+
# "payload": tool_output.to_json(),
|
190
191
|
"status": "completed" if tool_output.success else "failed",
|
191
192
|
}
|
192
193
|
)
|
@@ -211,6 +212,9 @@ def pick_plan(
|
|
211
212
|
}
|
212
213
|
)
|
213
214
|
code = extract_code(model(prompt))
|
215
|
+
tool_output = code_interpreter.exec_isolation(
|
216
|
+
DefaultImports.prepend_imports(code)
|
217
|
+
)
|
214
218
|
log_progress(
|
215
219
|
{
|
216
220
|
"type": "log",
|
@@ -220,13 +224,10 @@ def pick_plan(
|
|
220
224
|
else "Code execution failed"
|
221
225
|
),
|
222
226
|
"code": DefaultImports.prepend_imports(code),
|
223
|
-
"payload": tool_output.to_json(),
|
227
|
+
# "payload": tool_output.to_json(),
|
224
228
|
"status": "completed" if tool_output.success else "failed",
|
225
229
|
}
|
226
230
|
)
|
227
|
-
tool_output = code_interpreter.exec_isolation(
|
228
|
-
DefaultImports.prepend_imports(code)
|
229
|
-
)
|
230
231
|
tool_output_str = ""
|
231
232
|
if len(tool_output.logs.stdout) > 0:
|
232
233
|
tool_output_str = tool_output.logs.stdout[0]
|
File without changes
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from typing import Any, Dict, Optional
|
4
|
+
|
5
|
+
from requests import Session
|
6
|
+
from requests.adapters import HTTPAdapter
|
7
|
+
from requests.exceptions import ConnectionError, RequestException, Timeout
|
8
|
+
|
9
|
+
_LOGGER = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class BaseHTTP:
|
13
|
+
_TIMEOUT = 30 # seconds
|
14
|
+
_MAX_RETRIES = 3
|
15
|
+
|
16
|
+
def __init__(
|
17
|
+
self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
|
18
|
+
) -> None:
|
19
|
+
self._headers = headers
|
20
|
+
if headers is None:
|
21
|
+
self._headers = {
|
22
|
+
"Content-Type": "application/json",
|
23
|
+
}
|
24
|
+
self._base_endpoint = base_endpoint
|
25
|
+
self._session = Session()
|
26
|
+
self._session.headers.update(self._headers) # type: ignore
|
27
|
+
self._session.mount(
|
28
|
+
self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
|
29
|
+
)
|
30
|
+
|
31
|
+
def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
32
|
+
formatted_url = f"{self._base_endpoint}/{url}"
|
33
|
+
_LOGGER.info(f"Sending data to {formatted_url}")
|
34
|
+
try:
|
35
|
+
response = self._session.post(
|
36
|
+
url=formatted_url, json=payload, timeout=self._TIMEOUT
|
37
|
+
)
|
38
|
+
response.raise_for_status()
|
39
|
+
result: Dict[str, Any] = response.json()
|
40
|
+
_LOGGER.info(json.dumps(result))
|
41
|
+
except (ConnectionError, Timeout, RequestException) as err:
|
42
|
+
_LOGGER.warning(f"Error: {err}.")
|
43
|
+
except json.JSONDecodeError:
|
44
|
+
resp_text = response.text
|
45
|
+
_LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
|
46
|
+
return result
|
@@ -0,0 +1,26 @@
|
|
1
|
+
import os
|
2
|
+
from uuid import UUID
|
3
|
+
from typing import List
|
4
|
+
|
5
|
+
from vision_agent.clients.http import BaseHTTP
|
6
|
+
from vision_agent.utils.type_defs import LandingaiAPIKey
|
7
|
+
from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
|
8
|
+
|
9
|
+
|
10
|
+
class LandingPublicAPI(BaseHTTP):
|
11
|
+
def __init__(self) -> None:
|
12
|
+
landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
|
13
|
+
landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
|
14
|
+
headers = {"Content-Type": "application/json", "apikey": landing_api_key}
|
15
|
+
super().__init__(base_endpoint=landing_url, headers=headers)
|
16
|
+
|
17
|
+
def launch_fine_tuning_job(
|
18
|
+
self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
|
19
|
+
) -> UUID:
|
20
|
+
url = "v1/agent/jobs/fine-tuning"
|
21
|
+
data = {
|
22
|
+
"model": {"name": model_name, "task": task.value},
|
23
|
+
"bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
|
24
|
+
}
|
25
|
+
response = self.post(url, payload=data)
|
26
|
+
return UUID(response["jobId"])
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import os
|
2
2
|
import subprocess
|
3
|
+
from uuid import UUID
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, Dict, List, Union
|
5
6
|
|
@@ -7,6 +8,9 @@ import vision_agent as va
|
|
7
8
|
from vision_agent.lmm.types import Message
|
8
9
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
9
10
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
11
|
+
from vision_agent.utils.image_utils import convert_to_b64
|
12
|
+
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
13
|
+
from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
|
10
14
|
|
11
15
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
12
16
|
|
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
|
|
385
389
|
return TOOL_DESCRIPTIONS
|
386
390
|
|
387
391
|
|
392
|
+
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
|
393
|
+
"""'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
|
394
|
+
to detect objects in an image based on a given dataset. It returns the fine
|
395
|
+
tuning job id.
|
396
|
+
|
397
|
+
Parameters:
|
398
|
+
bboxes (List[BboxInput]): A list of BboxInput containing the
|
399
|
+
image path, labels and bounding boxes.
|
400
|
+
task (PromptTask): The florencev2 fine-tuning task. The options are
|
401
|
+
CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
|
402
|
+
|
403
|
+
Returns:
|
404
|
+
UUID: The fine tuning job id, this id will used to retrieve the fine
|
405
|
+
tuned model.
|
406
|
+
|
407
|
+
Example
|
408
|
+
-------
|
409
|
+
>>> fine_tuning_job_id = florencev2_fine_tuning(
|
410
|
+
[{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
|
411
|
+
{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
|
412
|
+
"OBJECT_DETECTION"
|
413
|
+
)
|
414
|
+
"""
|
415
|
+
bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
|
416
|
+
task_input = PromptTask[task]
|
417
|
+
fine_tuning_request = [
|
418
|
+
BboxInputBase64(
|
419
|
+
image=convert_to_b64(bbox_input.image_path),
|
420
|
+
filename=bbox_input.image_path.split("/")[-1],
|
421
|
+
labels=bbox_input.labels,
|
422
|
+
bboxes=bbox_input.bboxes,
|
423
|
+
)
|
424
|
+
for bbox_input in bboxes_input
|
425
|
+
]
|
426
|
+
landing_api = LandingPublicAPI()
|
427
|
+
return landing_api.launch_fine_tuning_job(
|
428
|
+
"florencev2", task_input, fine_tuning_request
|
429
|
+
)
|
430
|
+
|
431
|
+
|
388
432
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
389
433
|
[
|
390
434
|
get_tool_descriptions,
|
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
398
442
|
search_dir,
|
399
443
|
search_file,
|
400
444
|
find_file,
|
445
|
+
florencev2_fine_tuning,
|
401
446
|
]
|
402
447
|
)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import List, Tuple
|
3
|
+
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
|
7
|
+
class BboxInput(BaseModel):
|
8
|
+
image_path: str
|
9
|
+
labels: List[str]
|
10
|
+
bboxes: List[Tuple[int, int, int, int]]
|
11
|
+
|
12
|
+
|
13
|
+
class BboxInputBase64(BaseModel):
|
14
|
+
image: str
|
15
|
+
filename: str
|
16
|
+
labels: List[str]
|
17
|
+
bboxes: List[Tuple[int, int, int, int]]
|
18
|
+
|
19
|
+
|
20
|
+
class PromptTask(str, Enum):
|
21
|
+
"""
|
22
|
+
Valid task prompts options for the Florencev2 model.
|
23
|
+
"""
|
24
|
+
|
25
|
+
CAPTION = "<CAPTION>"
|
26
|
+
""""""
|
27
|
+
CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
|
28
|
+
""""""
|
29
|
+
OBJECT_DETECTION = "<OD>"
|
30
|
+
""""""
|
vision_agent/tools/tools.py
CHANGED
@@ -2,23 +2,23 @@ import io
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import tempfile
|
5
|
-
from importlib import resources
|
6
5
|
from pathlib import Path
|
6
|
+
from importlib import resources
|
7
7
|
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
8
8
|
|
9
9
|
import cv2
|
10
|
-
import numpy as np
|
11
10
|
import requests
|
11
|
+
import numpy as np
|
12
|
+
from pytube import YouTube # type: ignore
|
12
13
|
from moviepy.editor import ImageSequenceClip
|
13
14
|
from PIL import Image, ImageDraw, ImageFont
|
14
15
|
from pillow_heif import register_heif_opener # type: ignore
|
15
|
-
from pytube import YouTube # type: ignore
|
16
16
|
|
17
17
|
from vision_agent.tools.tool_utils import (
|
18
|
+
send_inference_request,
|
18
19
|
get_tool_descriptions,
|
19
20
|
get_tool_documentation,
|
20
21
|
get_tools_df,
|
21
|
-
send_inference_request,
|
22
22
|
)
|
23
23
|
from vision_agent.utils import extract_frames_from_video
|
24
24
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
@@ -1063,7 +1063,6 @@ def save_video(
|
|
1063
1063
|
if fps <= 0:
|
1064
1064
|
_LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
|
1065
1065
|
fps = 4
|
1066
|
-
|
1067
1066
|
with ImageSequenceClip(frames, fps=fps) as video:
|
1068
1067
|
if output_video_path:
|
1069
1068
|
f = open(output_video_path, "wb")
|
vision_agent/utils/execute.py
CHANGED
@@ -209,7 +209,7 @@ class Result:
|
|
209
209
|
return formats
|
210
210
|
|
211
211
|
@staticmethod
|
212
|
-
def from_e2b_result(result: E2BResult) -> "Result":
|
212
|
+
def from_e2b_result(result: E2BResult) -> "Result":
|
213
213
|
"""
|
214
214
|
Creates a Result object from an E2BResult object.
|
215
215
|
"""
|
@@ -361,7 +361,7 @@ class Execution(BaseModel):
|
|
361
361
|
)
|
362
362
|
|
363
363
|
@staticmethod
|
364
|
-
def from_e2b_execution(exec: E2BExecution) -> "Execution":
|
364
|
+
def from_e2b_execution(exec: E2BExecution) -> "Execution":
|
365
365
|
"""Creates an Execution object from an E2BResult object."""
|
366
366
|
return Execution(
|
367
367
|
results=[Result.from_e2b_result(res) for res in exec.results],
|
@@ -2,28 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9Ks,135
|
3
3
|
vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=N8oVwfxrz6emHlucJC5hGQvkA9cQWW2sMLFtshwLdI8,30309
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
|
9
|
+
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
vision_agent/clients/http.py,sha256=1WMt29F12YFfPH03AttKxnUNXx5sNOD9ZuH4etbB054,1598
|
11
|
+
vision_agent/clients/landing_public_api.py,sha256=Tjl8uBZWc3dvrCOKg-PCYjw3RC3X5Y6B50kaKn_QzL0,1050
|
9
12
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
13
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
11
14
|
vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
|
12
15
|
vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
|
13
16
|
vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
|
14
|
-
vision_agent/tools/__init__.py,sha256=
|
15
|
-
vision_agent/tools/meta_tools.py,sha256=
|
17
|
+
vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
|
18
|
+
vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
|
19
|
+
vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
|
16
20
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
17
21
|
vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
|
18
|
-
vision_agent/tools/tools.py,sha256=
|
22
|
+
vision_agent/tools/tools.py,sha256=aYo0xSbdr-Q4gq_dKxa8yLyczmXoKv_vYYrZ7dM38bw,43219
|
19
23
|
vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
|
20
24
|
vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
|
21
|
-
vision_agent/utils/execute.py,sha256=
|
25
|
+
vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
|
22
26
|
vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
|
23
27
|
vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
|
24
28
|
vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
|
25
29
|
vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
|
26
|
-
vision_agent-0.2.
|
27
|
-
vision_agent-0.2.
|
28
|
-
vision_agent-0.2.
|
29
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.99.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.99.dist-info/METADATA,sha256=QDiN7-jSVTpGtrwJLhvSUM1A7aj1baWhZ9eFf1GVn2E,10728
|
32
|
+
vision_agent-0.2.99.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.99.dist-info/RECORD,,
|
File without changes
|
File without changes
|