vision-agent 0.2.110__py3-none-any.whl → 0.2.112__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/agent_utils.py +3 -8
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +28 -20
- vision_agent/agent/vision_agent_coder_prompts.py +9 -7
- vision_agent/agent/vision_agent_prompts.py +11 -10
- vision_agent/clients/http.py +15 -3
- vision_agent/clients/landing_public_api.py +14 -2
- vision_agent/tools/__init__.py +11 -5
- vision_agent/tools/meta_tools.py +1 -46
- vision_agent/tools/tool_utils.py +25 -10
- vision_agent/tools/tools.py +463 -99
- vision_agent/tools/tools_types.py +84 -0
- vision_agent/utils/exceptions.py +13 -0
- vision_agent/utils/execute.py +0 -1
- vision_agent/utils/image_utils.py +52 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/METADATA +1 -1
- vision_agent-0.2.112.dist-info/RECORD +33 -0
- vision_agent/tools/meta_tools_types.py +0 -30
- vision_agent-0.2.110.dist-info/RECORD +0 -33
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/WHEEL +0 -0
@@ -4,14 +4,13 @@ import sys
|
|
4
4
|
from typing import Any, Dict
|
5
5
|
|
6
6
|
logging.basicConfig(stream=sys.stdout)
|
7
|
-
_LOGGER = logging.getLogger(__name__)
|
8
7
|
|
9
8
|
|
10
9
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
11
10
|
try:
|
11
|
+
json_str = json_str.replace("\n", " ")
|
12
12
|
json_dict = json.loads(json_str)
|
13
13
|
except json.JSONDecodeError:
|
14
|
-
input_json_str = json_str
|
15
14
|
if "```json" in json_str:
|
16
15
|
json_str = json_str[json_str.find("```json") + len("```json") :]
|
17
16
|
json_str = json_str[: json_str.find("```")]
|
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
|
|
19
18
|
json_str = json_str[json_str.find("```") + len("```") :]
|
20
19
|
# get the last ``` not one from an intermediate string
|
21
20
|
json_str = json_str[: json_str.find("}```")]
|
22
|
-
|
23
|
-
|
24
|
-
except json.JSONDecodeError as e:
|
25
|
-
error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
|
26
|
-
_LOGGER.exception(error_msg)
|
27
|
-
raise ValueError(error_msg) from e
|
21
|
+
|
22
|
+
json_dict = json.loads(json_str)
|
28
23
|
return json_dict # type: ignore
|
29
24
|
|
30
25
|
|
@@ -28,7 +28,7 @@ class DefaultImports:
|
|
28
28
|
code = [
|
29
29
|
"from typing import *",
|
30
30
|
"from vision_agent.utils.execute import CodeInterpreter",
|
31
|
-
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions
|
31
|
+
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
|
32
32
|
]
|
33
33
|
|
34
34
|
@staticmethod
|
@@ -4,6 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import sys
|
6
6
|
import tempfile
|
7
|
+
from json import JSONDecodeError
|
7
8
|
from pathlib import Path
|
8
9
|
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
|
9
10
|
|
@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
|
|
86
87
|
def format_plans(plans: Dict[str, Any]) -> str:
|
87
88
|
plan_str = ""
|
88
89
|
for k, v in plans.items():
|
89
|
-
plan_str += f"{k}
|
90
|
-
plan_str += "-" + "\n-".join([e
|
90
|
+
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
91
|
+
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
91
92
|
|
92
93
|
return plan_str
|
93
94
|
|
@@ -228,13 +229,11 @@ def pick_plan(
|
|
228
229
|
"status": "completed" if tool_output.success else "failed",
|
229
230
|
}
|
230
231
|
)
|
231
|
-
tool_output_str =
|
232
|
-
if len(tool_output.logs.stdout) > 0:
|
233
|
-
tool_output_str = tool_output.logs.stdout[0]
|
232
|
+
tool_output_str = tool_output.text().strip()
|
234
233
|
|
235
234
|
if verbosity == 2:
|
236
235
|
_print_code("Code and test after attempted fix:", code)
|
237
|
-
_LOGGER.info(f"Code execution result after
|
236
|
+
_LOGGER.info(f"Code execution result after attempt {count}")
|
238
237
|
|
239
238
|
count += 1
|
240
239
|
|
@@ -251,7 +250,21 @@ def pick_plan(
|
|
251
250
|
tool_output=tool_output_str[:20_000],
|
252
251
|
)
|
253
252
|
chat[-1]["content"] = prompt
|
254
|
-
|
253
|
+
|
254
|
+
count = 0
|
255
|
+
best_plan = None
|
256
|
+
while best_plan is None and count < max_retries:
|
257
|
+
try:
|
258
|
+
best_plan = extract_json(model(chat, stream=False)) # type: ignore
|
259
|
+
except JSONDecodeError as e:
|
260
|
+
_LOGGER.exception(
|
261
|
+
f"Error while extracting JSON during picking best plan {str(e)}"
|
262
|
+
)
|
263
|
+
pass
|
264
|
+
count += 1
|
265
|
+
|
266
|
+
if best_plan is None:
|
267
|
+
best_plan = {"best_plan": list(plans.keys())[0]}
|
255
268
|
|
256
269
|
if verbosity >= 1:
|
257
270
|
_LOGGER.info(f"Best plan:\n{best_plan}")
|
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
|
525
538
|
|
526
539
|
|
527
540
|
def retrieve_tools(
|
528
|
-
plans: Dict[str,
|
541
|
+
plans: Dict[str, Dict[str, Any]],
|
529
542
|
tool_recommender: Sim,
|
530
543
|
log_progress: Callable[[Dict[str, Any]], None],
|
531
544
|
verbosity: int = 0,
|
@@ -542,8 +555,8 @@ def retrieve_tools(
|
|
542
555
|
tool_lists: Dict[str, List[Dict[str, str]]] = {}
|
543
556
|
for k, plan in plans.items():
|
544
557
|
tool_lists[k] = []
|
545
|
-
for task in plan:
|
546
|
-
tools = tool_recommender.top_k(task
|
558
|
+
for task in plan["instructions"]:
|
559
|
+
tools = tool_recommender.top_k(task, k=2, thresh=0.3)
|
547
560
|
tool_info.extend([e["doc"] for e in tools])
|
548
561
|
tool_desc.extend([e["desc"] for e in tools])
|
549
562
|
tool_lists[k].extend(
|
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
|
|
737
750
|
if self.verbosity >= 1:
|
738
751
|
for p in plans:
|
739
752
|
# tabulate will fail if the keys are not the same for all elements
|
740
|
-
p_fixed = [
|
741
|
-
{
|
742
|
-
"instructions": (
|
743
|
-
e["instructions"] if "instructions" in e else ""
|
744
|
-
)
|
745
|
-
}
|
746
|
-
for e in plans[p]
|
747
|
-
]
|
753
|
+
p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
|
748
754
|
_LOGGER.info(
|
749
755
|
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
750
756
|
)
|
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
|
|
793
799
|
)
|
794
800
|
|
795
801
|
if self.verbosity >= 1:
|
802
|
+
plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
|
796
803
|
_LOGGER.info(
|
797
|
-
f"Picked best plan:\n{tabulate(tabular_data=
|
804
|
+
f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
798
805
|
)
|
799
806
|
|
800
807
|
results = write_and_test_code(
|
801
808
|
chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
|
802
|
-
plan="\n
|
809
|
+
plan=f"\n{plan_i['thoughts']}\n-"
|
810
|
+
+ "\n-".join([e for e in plan_i["instructions"]]),
|
803
811
|
tool_info=tool_info,
|
804
812
|
tool_output=tool_output_str,
|
805
813
|
tool_utils=T.UTILITIES_DOCSTRING,
|
@@ -30,18 +30,19 @@ PLAN = """
|
|
30
30
|
|
31
31
|
**Instructions**:
|
32
32
|
1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
|
33
|
-
2. Output three different plans each utilize a different strategy or
|
33
|
+
2. Output three different plans each utilize a different strategy or set of tools.
|
34
34
|
|
35
35
|
Output a list of jsons in the following format
|
36
36
|
|
37
37
|
```json
|
38
38
|
{{
|
39
39
|
"plan1":
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
{{
|
41
|
+
"thoughts": str # your thought process for choosing this plan
|
42
|
+
"instructions": [
|
43
|
+
str # what you should do in this task associated with a tool
|
44
|
+
]
|
45
|
+
}},
|
45
46
|
"plan2": ...,
|
46
47
|
"plan3": ...
|
47
48
|
}}
|
@@ -127,7 +128,8 @@ PICK_PLAN = """
|
|
127
128
|
|
128
129
|
**Instructions**:
|
129
130
|
1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
|
130
|
-
2.
|
131
|
+
2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
|
132
|
+
3. Output a JSON object with the following format:
|
131
133
|
{{
|
132
134
|
"thoughts": str # your thought process for choosing the best plan
|
133
135
|
"best_plan": str # the best plan you have chosen
|
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
|
|
15
15
|
**Examples**:
|
16
16
|
Here is an example of how you can interact with a user and Actions to complete a task:
|
17
17
|
--- START EXAMPLES ---
|
18
|
-
[Current directory: /workspace
|
18
|
+
[Current directory: /example/workspace]
|
19
19
|
{examples}
|
20
20
|
--- END EXAMPLES ---
|
21
21
|
|
@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
|
|
27
27
|
Here is the current conversation so far:
|
28
28
|
--- START CONVERSATION ---
|
29
29
|
[Current directory: {dir}]
|
30
|
+
|
30
31
|
{conversation}
|
31
32
|
"""
|
32
33
|
|
33
34
|
EXAMPLES_CODE1 = """
|
34
35
|
USER: Can you detect the dogs in this image? Media name dog.jpg
|
35
36
|
|
36
|
-
AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/
|
37
|
+
AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
|
37
38
|
|
38
39
|
OBSERVATION:
|
39
|
-
[File /workspace/
|
40
|
+
[File /example/workspace/dog_detector.py]
|
40
41
|
0|from vision_agent.tools import load_image, owl_v2
|
41
42
|
1|def detect_dogs(image_path: str):
|
42
43
|
2| image = load_image(image_path)
|
@@ -44,7 +45,7 @@ OBSERVATION:
|
|
44
45
|
4| return dogs
|
45
46
|
[End of file]
|
46
47
|
|
47
|
-
AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
|
48
|
+
AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
|
48
49
|
|
49
50
|
OBSERVATION:
|
50
51
|
----- stdout -----
|
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
|
|
55
56
|
|
56
57
|
USER: The the image only has one dog, can you fix this?
|
57
58
|
|
58
|
-
AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/
|
59
|
+
AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
|
59
60
|
|
60
61
|
OBSERVATION:
|
61
|
-
[File /workspace/
|
62
|
+
[File /example/workspace/dog_detector.py]
|
62
63
|
0|from vision_agent.tools import load_image, owl_v2
|
63
64
|
1|def detect_dogs(image_path: str):
|
64
65
|
2| image = load_image(image_path)
|
@@ -66,7 +67,7 @@ OBSERVATION:
|
|
66
67
|
4| return dogs
|
67
68
|
[End of file]
|
68
69
|
|
69
|
-
AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
|
70
|
+
AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
|
70
71
|
|
71
72
|
OBSERVATION:
|
72
73
|
----- stdout -----
|
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no
|
|
82
83
|
|
83
84
|
USER: Yes you can use workers.png
|
84
85
|
|
85
|
-
AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/
|
86
|
+
AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
|
86
87
|
|
87
88
|
OBSERVATION:
|
88
|
-
[File /workspace/
|
89
|
+
[File /example/workspace/code.py]
|
89
90
|
0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
|
90
91
|
1|def count_workers_with_helmets(image_path: str):
|
91
92
|
2| image = load_image(image_path)
|
@@ -104,7 +105,7 @@ OBSERVATION:
|
|
104
105
|
15| return count
|
105
106
|
[End of file]
|
106
107
|
|
107
|
-
AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/
|
108
|
+
AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
|
108
109
|
|
109
110
|
OBSERVATION:
|
110
111
|
----- stdout -----
|
vision_agent/clients/http.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
|
|
4
4
|
|
5
5
|
from requests import Session
|
6
6
|
from requests.adapters import HTTPAdapter
|
7
|
-
from requests.exceptions import ConnectionError, RequestException, Timeout
|
8
7
|
|
9
8
|
_LOGGER = logging.getLogger(__name__)
|
10
9
|
|
@@ -38,9 +37,22 @@ class BaseHTTP:
|
|
38
37
|
response.raise_for_status()
|
39
38
|
result: Dict[str, Any] = response.json()
|
40
39
|
_LOGGER.info(json.dumps(result))
|
41
|
-
except (ConnectionError, Timeout, RequestException) as err:
|
42
|
-
_LOGGER.warning(f"Error: {err}.")
|
43
40
|
except json.JSONDecodeError:
|
44
41
|
resp_text = response.text
|
45
42
|
_LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
|
43
|
+
raise
|
44
|
+
return result
|
45
|
+
|
46
|
+
def get(self, url: str) -> Dict[str, Any]:
|
47
|
+
formatted_url = f"{self._base_endpoint}/{url}"
|
48
|
+
_LOGGER.info(f"Sending data to {formatted_url}")
|
49
|
+
try:
|
50
|
+
response = self._session.get(url=formatted_url, timeout=self._TIMEOUT)
|
51
|
+
response.raise_for_status()
|
52
|
+
result: Dict[str, Any] = response.json()
|
53
|
+
_LOGGER.info(json.dumps(result))
|
54
|
+
except json.JSONDecodeError:
|
55
|
+
resp_text = response.text
|
56
|
+
_LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
|
57
|
+
raise
|
46
58
|
return result
|
@@ -1,10 +1,13 @@
|
|
1
1
|
import os
|
2
|
-
from uuid import UUID
|
3
2
|
from typing import List
|
3
|
+
from uuid import UUID
|
4
|
+
|
5
|
+
from requests.exceptions import HTTPError
|
4
6
|
|
5
7
|
from vision_agent.clients.http import BaseHTTP
|
6
8
|
from vision_agent.utils.type_defs import LandingaiAPIKey
|
7
|
-
from vision_agent.
|
9
|
+
from vision_agent.utils.exceptions import FineTuneModelNotFound
|
10
|
+
from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
|
8
11
|
|
9
12
|
|
10
13
|
class LandingPublicAPI(BaseHTTP):
|
@@ -24,3 +27,12 @@ class LandingPublicAPI(BaseHTTP):
|
|
24
27
|
}
|
25
28
|
response = self.post(url, payload=data)
|
26
29
|
return UUID(response["jobId"])
|
30
|
+
|
31
|
+
def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
|
32
|
+
url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
|
33
|
+
try:
|
34
|
+
get_job = self.get(url)
|
35
|
+
except HTTPError as err:
|
36
|
+
if err.response.status_code == 404:
|
37
|
+
raise FineTuneModelNotFound()
|
38
|
+
return JobStatus(get_job["status"])
|
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
from typing import Callable, List, Optional
|
2
2
|
|
3
|
-
from .meta_tools import
|
3
|
+
from .meta_tools import (
|
4
|
+
META_TOOL_DOCSTRING,
|
5
|
+
)
|
4
6
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
5
7
|
from .tools import (
|
6
8
|
TOOL_DESCRIPTIONS,
|
@@ -17,16 +19,20 @@ from .tools import (
|
|
17
19
|
detr_segmentation,
|
18
20
|
dpt_hybrid_midas,
|
19
21
|
extract_frames,
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
22
|
+
florence2_image_caption,
|
23
|
+
florence2_object_detection,
|
24
|
+
florence2_ocr,
|
25
|
+
florence2_roberta_vqa,
|
26
|
+
florence2_sam2_image,
|
27
|
+
florence2_sam2_video,
|
24
28
|
generate_pose_image,
|
25
29
|
generate_soft_edge_image,
|
26
30
|
get_tool_documentation,
|
27
31
|
git_vqa_v2,
|
28
32
|
grounding_dino,
|
29
33
|
grounding_sam,
|
34
|
+
ixc25_image_vqa,
|
35
|
+
ixc25_video_vqa,
|
30
36
|
load_image,
|
31
37
|
loca_visual_prompt_counting,
|
32
38
|
loca_zero_shot_counting,
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
import os
|
2
2
|
import subprocess
|
3
|
-
from uuid import UUID
|
4
3
|
from pathlib import Path
|
5
4
|
from typing import Any, Dict, List, Union
|
6
5
|
|
@@ -8,9 +7,6 @@ import vision_agent as va
|
|
8
7
|
from vision_agent.lmm.types import Message
|
9
8
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
10
9
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
11
|
-
from vision_agent.utils.image_utils import convert_to_b64
|
12
|
-
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
13
|
-
from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
|
14
10
|
|
15
11
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
16
12
|
|
@@ -384,51 +380,11 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str:
|
|
384
380
|
|
385
381
|
def get_tool_descriptions() -> str:
|
386
382
|
"""Returns a description of all the tools that `generate_vision_code` has access to.
|
387
|
-
Helpful for
|
383
|
+
Helpful for answering questions about what types of vision tasks you can do with
|
388
384
|
`generate_vision_code`."""
|
389
385
|
return TOOL_DESCRIPTIONS
|
390
386
|
|
391
387
|
|
392
|
-
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
|
393
|
-
"""'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
|
394
|
-
to detect objects in an image based on a given dataset. It returns the fine
|
395
|
-
tuning job id.
|
396
|
-
|
397
|
-
Parameters:
|
398
|
-
bboxes (List[BboxInput]): A list of BboxInput containing the
|
399
|
-
image path, labels and bounding boxes.
|
400
|
-
task (PromptTask): The florencev2 fine-tuning task. The options are
|
401
|
-
CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
|
402
|
-
|
403
|
-
Returns:
|
404
|
-
UUID: The fine tuning job id, this id will used to retrieve the fine
|
405
|
-
tuned model.
|
406
|
-
|
407
|
-
Example
|
408
|
-
-------
|
409
|
-
>>> fine_tuning_job_id = florencev2_fine_tuning(
|
410
|
-
[{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
|
411
|
-
{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
|
412
|
-
"OBJECT_DETECTION"
|
413
|
-
)
|
414
|
-
"""
|
415
|
-
bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
|
416
|
-
task_input = PromptTask[task]
|
417
|
-
fine_tuning_request = [
|
418
|
-
BboxInputBase64(
|
419
|
-
image=convert_to_b64(bbox_input.image_path),
|
420
|
-
filename=bbox_input.image_path.split("/")[-1],
|
421
|
-
labels=bbox_input.labels,
|
422
|
-
bboxes=bbox_input.bboxes,
|
423
|
-
)
|
424
|
-
for bbox_input in bboxes_input
|
425
|
-
]
|
426
|
-
landing_api = LandingPublicAPI()
|
427
|
-
return landing_api.launch_fine_tuning_job(
|
428
|
-
"florencev2", task_input, fine_tuning_request
|
429
|
-
)
|
430
|
-
|
431
|
-
|
432
388
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
433
389
|
[
|
434
390
|
get_tool_descriptions,
|
@@ -442,6 +398,5 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
442
398
|
search_dir,
|
443
399
|
search_file,
|
444
400
|
find_file,
|
445
|
-
florencev2_fine_tuning,
|
446
401
|
]
|
447
402
|
)
|
vision_agent/tools/tool_utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import inspect
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
-
from typing import Any, Callable, Dict, List, MutableMapping, Optional
|
4
|
+
from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
|
5
5
|
|
6
6
|
import pandas as pd
|
7
7
|
from IPython.display import display
|
@@ -15,9 +15,10 @@ from vision_agent.utils.execute import Error, MimeType
|
|
15
15
|
from vision_agent.utils.type_defs import LandingaiAPIKey
|
16
16
|
|
17
17
|
_LOGGER = logging.getLogger(__name__)
|
18
|
-
_LND_API_KEY = LandingaiAPIKey().api_key
|
19
|
-
|
20
|
-
|
18
|
+
_LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
|
19
|
+
_LND_BASE_URL = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
|
20
|
+
_LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model"
|
21
|
+
_LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools"
|
21
22
|
|
22
23
|
|
23
24
|
class ToolCallTrace(BaseModel):
|
@@ -28,8 +29,14 @@ class ToolCallTrace(BaseModel):
|
|
28
29
|
|
29
30
|
|
30
31
|
def send_inference_request(
|
31
|
-
payload: Dict[str, Any],
|
32
|
+
payload: Dict[str, Any],
|
33
|
+
endpoint_name: str,
|
34
|
+
files: Optional[List[Tuple[Any, ...]]] = None,
|
35
|
+
v2: bool = False,
|
36
|
+
metadata_payload: Optional[Dict[str, Any]] = None,
|
32
37
|
) -> Dict[str, Any]:
|
38
|
+
# TODO: runtime_tag and function_name should be metadata_payload and now included
|
39
|
+
# in the service payload
|
33
40
|
try:
|
34
41
|
if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
|
35
42
|
payload["runtime_tag"] = runtime_tag
|
@@ -44,7 +51,7 @@ def send_inference_request(
|
|
44
51
|
response={},
|
45
52
|
error=None,
|
46
53
|
)
|
47
|
-
headers = {"
|
54
|
+
headers = {"apikey": _LND_API_KEY}
|
48
55
|
if "TOOL_ENDPOINT_AUTH" in os.environ:
|
49
56
|
headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
|
50
57
|
headers.pop("apikey")
|
@@ -54,7 +61,11 @@ def send_inference_request(
|
|
54
61
|
num_retry=3,
|
55
62
|
headers=headers,
|
56
63
|
)
|
57
|
-
|
64
|
+
|
65
|
+
if files is not None:
|
66
|
+
res = session.post(url, data=payload, files=files)
|
67
|
+
else:
|
68
|
+
res = session.post(url, json=payload)
|
58
69
|
if res.status_code != 200:
|
59
70
|
tool_call_trace.error = Error(
|
60
71
|
name="RemoteToolCallFailed",
|
@@ -62,9 +73,13 @@ def send_inference_request(
|
|
62
73
|
traceback_raw=[],
|
63
74
|
)
|
64
75
|
_LOGGER.error(f"Request failed: {res.status_code} {res.text}")
|
65
|
-
|
66
|
-
|
67
|
-
|
76
|
+
# TODO: function_name should be in metadata_payload
|
77
|
+
function_name = "unknown"
|
78
|
+
if "function_name" in payload:
|
79
|
+
function_name = payload["function_name"]
|
80
|
+
elif metadata_payload is not None and "function_name" in metadata_payload:
|
81
|
+
function_name = metadata_payload["function_name"]
|
82
|
+
raise RemoteToolCallFailed(function_name, res.status_code, res.text)
|
68
83
|
|
69
84
|
resp = res.json()
|
70
85
|
tool_call_trace.response = resp
|