vision-agent 0.2.110__py3-none-any.whl → 0.2.112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_utils.py +3 -8
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +28 -20
- vision_agent/agent/vision_agent_coder_prompts.py +9 -7
- vision_agent/agent/vision_agent_prompts.py +11 -10
- vision_agent/clients/http.py +15 -3
- vision_agent/clients/landing_public_api.py +14 -2
- vision_agent/tools/__init__.py +11 -5
- vision_agent/tools/meta_tools.py +1 -46
- vision_agent/tools/tool_utils.py +25 -10
- vision_agent/tools/tools.py +463 -99
- vision_agent/tools/tools_types.py +84 -0
- vision_agent/utils/exceptions.py +13 -0
- vision_agent/utils/execute.py +0 -1
- vision_agent/utils/image_utils.py +52 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/METADATA +1 -1
- vision_agent-0.2.112.dist-info/RECORD +33 -0
- vision_agent/tools/meta_tools_types.py +0 -30
- vision_agent-0.2.110.dist-info/RECORD +0 -33
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.110.dist-info → vision_agent-0.2.112.dist-info}/WHEEL +0 -0
vision_agent/agent/agent_utils.py
CHANGED
@@ -4,14 +4,13 @@ import sys
 from typing import Any, Dict
 
 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore
 
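Taken together, the two agent_utils.py hunks change extract_json in three ways: newlines are flattened before the first parse attempt, the inner try/except that wrapped failures in a ValueError is gone, and any remaining json.JSONDecodeError now propagates to the caller. A minimal standalone sketch of the new control flow (not the packaged function itself, which also handles bare ``` fences):

```python
import json
from typing import Any, Dict


def extract_json_sketch(json_str: str) -> Dict[str, Any]:
    # First attempt: parse after flattening newlines, as the new code does.
    try:
        return json.loads(json_str.replace("\n", " "))
    except json.JSONDecodeError:
        # Fallback: strip a ```json ... ``` fence and parse what remains.
        if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json"):]
            json_str = json_str[: json_str.find("```")]
        # A remaining JSONDecodeError now propagates to the caller,
        # which is what lets pick_plan (below) retry the model call.
        return json.loads(json_str)


print(extract_json_sketch('```json {"best_plan": "plan1"} ```'))
```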
vision_agent/agent/vision_agent.py
CHANGED
@@ -28,7 +28,7 @@ class DefaultImports:
     code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions
+        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
     ]
 
     @staticmethod
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
 
@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}
-        plan_str += "-" + "\n-".join([e
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += "    -" + "\n    -".join([e for e in v["instructions"]])
 
     return plan_str
 
@@ -228,13 +229,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str =
-        if len(tool_output.logs.stdout) > 0:
-            tool_output_str = tool_output.logs.stdout[0]
+        tool_output_str = tool_output.text().strip()
 
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after
+            _LOGGER.info(f"Code execution result after attempt {count}")
 
         count += 1
 
@@ -251,7 +250,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat, stream=False))  # type: ignore
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}
 
     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
 
 
 def retrieve_tools(
-    plans: Dict[str,
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
         if self.verbosity >= 1:
             for p in plans:
                 # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [
-                    {
-                        "instructions": (
-                            e["instructions"] if "instructions" in e else ""
-                        )
-                    }
-                    for e in plans[p]
-                ]
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                 _LOGGER.info(
                     f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
         )
 
         if self.verbosity >= 1:
+            plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
             _LOGGER.info(
-                f"Picked best plan:\n{tabulate(tabular_data=
+                f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
            )
 
         results = write_and_test_code(
             chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-            plan="\n
+            plan=f"\n{plan_i['thoughts']}\n-"
+            + "\n-".join([e for e in plan_i["instructions"]]),
             tool_info=tool_info,
             tool_output=tool_output_str,
             tool_utils=T.UTILITIES_DOCSTRING,
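The structural change running through vision_agent_coder.py is that each generated plan is now a dict with "thoughts" and a list of "instructions", and pick_plan retries JSON extraction instead of failing on the first malformed response. A standalone sketch of the fallback behavior, with a hardcoded string standing in for the model call (names and values here are illustrative, not the package's API):

```python
import json
from json import JSONDecodeError
from typing import Any, Dict

# The new plan shape: one dict per plan with "thoughts" and "instructions".
plans: Dict[str, Any] = {
    "plan1": {
        "thoughts": "Detect dogs directly with an object detector.",
        "instructions": ["Load the image", "Run the detector with prompt 'dog'"],
    },
    "plan2": {
        "thoughts": "Segment first, then filter by label.",
        "instructions": ["Load the image", "Run a segmentation tool"],
    },
}


def pick_best_plan(model_output: str, max_retries: int = 3) -> Dict[str, str]:
    # Mirrors the new retry loop; in the real code the model is re-queried on
    # each attempt, and the first plan is the fallback rather than an error.
    best_plan = None
    count = 0
    while best_plan is None and count < max_retries:
        try:
            best_plan = json.loads(model_output)
        except JSONDecodeError:
            pass
        count += 1
    if best_plan is None:
        best_plan = {"best_plan": list(plans.keys())[0]}
    return best_plan


print(pick_best_plan("not json"))                # -> {'best_plan': 'plan1'}
print(pick_best_plan('{"best_plan": "plan2"}'))  # -> {'best_plan': 'plan2'}
```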
vision_agent/agent/vision_agent_coder_prompts.py
CHANGED
@@ -30,18 +30,19 @@ PLAN = """
 
 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or
+2. Output three different plans each utilize a different strategy or set of tools.
 
 Output a list of jsons in the following format
 
 ```json
 {{
     "plan1":
-
-
-
-
-
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
@@ -127,7 +128,8 @@ PICK_PLAN = """
 
 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2.
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
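So the PLAN prompt now asks the model for an object per plan ("thoughts" plus an "instructions" list) rather than a bare list of steps, which is the shape format_plans in vision_agent_coder.py above consumes. A quick sketch of how a conforming response would be rendered, with invented values:

```python
from typing import Any, Dict

# A response in the shape the revised PLAN prompt requests (values invented).
plans: Dict[str, Any] = {
    "plan1": {
        "thoughts": "Use a zero-shot detector.",
        "instructions": ["Load the image", "Detect dogs with owl_v2"],
    }
}

# Rendering in the style of the updated format_plans: plan name and thoughts
# on one line, then one indented dash per instruction.
for name, plan in plans.items():
    print(f"{name}: {plan['thoughts']}")
    print("    -" + "\n    -".join(plan["instructions"]))
```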
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """
 
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
 15|    return count
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
vision_agent/clients/http.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
 
 from requests import Session
 from requests.adapters import HTTPAdapter
-from requests.exceptions import ConnectionError, RequestException, Timeout
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -38,9 +37,22 @@ class BaseHTTP:
             response.raise_for_status()
             result: Dict[str, Any] = response.json()
             _LOGGER.info(json.dumps(result))
-        except (ConnectionError, Timeout, RequestException) as err:
-            _LOGGER.warning(f"Error: {err}.")
         except json.JSONDecodeError:
             resp_text = response.text
             _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+            raise
+        return result
+
+    def get(self, url: str) -> Dict[str, Any]:
+        formatted_url = f"{self._base_endpoint}/{url}"
+        _LOGGER.info(f"Sending data to {formatted_url}")
+        try:
+            response = self._session.get(url=formatted_url, timeout=self._TIMEOUT)
+            response.raise_for_status()
+            result: Dict[str, Any] = response.json()
+            _LOGGER.info(json.dumps(result))
+        except json.JSONDecodeError:
+            resp_text = response.text
+            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+            raise
        return result
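Two behavioral notes: connection and timeout errors are no longer caught and logged inside the client, and a malformed JSON body is re-raised after the warning, so both now surface to callers such as the new check_fine_tuning_job below. A sketch of calling the new get; `client` is assumed to be an already-constructed BaseHTTP instance, since its constructor is outside this diff:

```python
import json
from typing import Any, Dict, Optional

from requests.exceptions import HTTPError


def fetch_json(client, path: str) -> Optional[Dict[str, Any]]:
    # `client` is assumed to be a constructed BaseHTTP; get() joins `path`
    # onto the client's base endpoint and returns the decoded JSON body.
    try:
        return client.get(path)
    except HTTPError:
        # raise_for_status() propagates HTTP errors to the caller.
        raise
    except json.JSONDecodeError:
        # A non-JSON body is logged by BaseHTTP and then re-raised; callers
        # decide how to degrade.
        return None
```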
vision_agent/clients/landing_public_api.py
CHANGED
@@ -1,10 +1,13 @@
 import os
-from uuid import UUID
 from typing import List
+from uuid import UUID
+
+from requests.exceptions import HTTPError
 
 from vision_agent.clients.http import BaseHTTP
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.
+from vision_agent.utils.exceptions import FineTuneModelNotFound
+from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
 
 
 class LandingPublicAPI(BaseHTTP):
@@ -24,3 +27,12 @@ class LandingPublicAPI(BaseHTTP):
         }
         response = self.post(url, payload=data)
         return UUID(response["jobId"])
+
+    def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
+        url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
+        try:
+            get_job = self.get(url)
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise FineTuneModelNotFound()
+        return JobStatus(get_job["status"])
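Usage of the new status endpoint, assuming a job id previously returned by launch_fine_tuning_job (the UUID below is a placeholder):

```python
from uuid import UUID

from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.utils.exceptions import FineTuneModelNotFound

# Placeholder id; in practice use the UUID returned by launch_fine_tuning_job.
job_id = UUID("00000000-0000-0000-0000-000000000000")

api = LandingPublicAPI()
try:
    status = api.check_fine_tuning_job(job_id)  # returns a JobStatus value
    print(status)
except FineTuneModelNotFound:
    # Raised when the status endpoint answers 404 for an unknown job id.
    print("no such fine-tuning job")
```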
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,8 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import
+from .meta_tools import (
+    META_TOOL_DOCSTRING,
+)
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (
     TOOL_DESCRIPTIONS,
@@ -17,16 +19,20 @@ from .tools import (
     detr_segmentation,
     dpt_hybrid_midas,
     extract_frames,
-
-
-
-
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import subprocess
-from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
@@ -8,9 +7,6 @@ import vision_agent as va
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.utils.image_utils import convert_to_b64
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -384,51 +380,11 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str:
 
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
-    Helpful for
+    Helpful for answering questions about what types of vision tasks you can do with
     `generate_vision_code`."""
     return TOOL_DESCRIPTIONS
 
 
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = florencev2_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
-        )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -442,6 +398,5 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         search_dir,
         search_file,
         find_file,
-        florencev2_fine_tuning,
     ]
 )
vision_agent/tools/tool_utils.py
CHANGED
@@ -1,7 +1,7 @@
 import inspect
 import logging
 import os
-from typing import Any, Callable, Dict, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd
 from IPython.display import display
@@ -15,9 +15,10 @@ from vision_agent.utils.execute import Error, MimeType
 from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
-_LND_API_KEY = LandingaiAPIKey().api_key
-
-
+_LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
+_LND_BASE_URL = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
+_LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model"
+_LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools"
 
 
 class ToolCallTrace(BaseModel):
@@ -28,8 +29,14 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any],
+    payload: Dict[str, Any],
+    endpoint_name: str,
+    files: Optional[List[Tuple[Any, ...]]] = None,
+    v2: bool = False,
+    metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
+    # TODO: runtime_tag and function_name should be metadata_payload and now included
+    # in the service payload
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
             payload["runtime_tag"] = runtime_tag
@@ -44,7 +51,7 @@ def send_inference_request(
         response={},
         error=None,
     )
-    headers = {"
+    headers = {"apikey": _LND_API_KEY}
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -54,7 +61,11 @@ def send_inference_request(
         num_retry=3,
         headers=headers,
     )
-    res = session.post(url, json=payload)
+
+    if files is not None:
+        res = session.post(url, data=payload, files=files)
+    else:
+        res = session.post(url, json=payload)
     if res.status_code != 200:
         tool_call_trace.error = Error(
             name="RemoteToolCallFailed",
@@ -62,9 +73,13 @@ def send_inference_request(
             traceback_raw=[],
         )
         _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
-
-
-
+        # TODO: function_name should be in metadata_payload
+        function_name = "unknown"
+        if "function_name" in payload:
+            function_name = payload["function_name"]
+        elif metadata_payload is not None and "function_name" in metadata_payload:
+            function_name = metadata_payload["function_name"]
+        raise RemoteToolCallFailed(function_name, res.status_code, res.text)
 
     resp = res.json()
     tool_call_trace.response = resp
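send_inference_request now takes the endpoint name and optional multipart files alongside the payload, and the v2 flag presumably selects the new /v1/tools base URL over the older agent-model one. A call-shape sketch based only on the new signature; the endpoint name, payload keys, and file tuple are invented for illustration:

```python
from vision_agent.tools.tool_utils import send_inference_request

# Hypothetical call shapes; a real LANDINGAI_API_KEY and endpoint are needed.
with open("image.png", "rb") as f:
    image_bytes = f.read()

resp = send_inference_request(
    {"function_name": "my_tool"},    # read back for error reporting on non-200
    "my-endpoint",                   # endpoint_name (invented value)
    files=[("image", image_bytes)],  # sent as multipart form data, not JSON
    v2=True,                         # presumably routes to the /v1/tools base URL
)
print(resp)
```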