vision-agent 0.2.110__tar.gz → 0.2.112__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34)
  1. {vision_agent-0.2.110 → vision_agent-0.2.112}/PKG-INFO +1 -1
  2. {vision_agent-0.2.110 → vision_agent-0.2.112}/pyproject.toml +3 -1
  3. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/agent_utils.py +3 -8
  4. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/vision_agent.py +1 -1
  5. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder.py +28 -20
  6. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder_prompts.py +9 -7
  7. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_prompts.py +11 -10
  8. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/clients/http.py +15 -3
  9. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/clients/landing_public_api.py +14 -2
  10. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/tools/__init__.py +11 -5
  11. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/tools/meta_tools.py +1 -46
  12. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/tools/tool_utils.py +25 -10
  13. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/tools/tools.py +463 -99
  14. vision_agent-0.2.112/vision_agent/tools/tools_types.py +84 -0
  15. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/exceptions.py +13 -0
  16. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/execute.py +0 -1
  17. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/image_utils.py +52 -0
  18. vision_agent-0.2.110/vision_agent/tools/meta_tools_types.py +0 -30
  19. {vision_agent-0.2.110 → vision_agent-0.2.112}/LICENSE +0 -0
  20. {vision_agent-0.2.110 → vision_agent-0.2.112}/README.md +0 -0
  21. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/__init__.py +0 -0
  22. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/__init__.py +0 -0
  23. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/agent/agent.py +0 -0
  24. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/clients/__init__.py +0 -0
  25. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/fonts/__init__.py +0 -0
  26. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  27. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/lmm/__init__.py +0 -0
  28. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/lmm/lmm.py +0 -0
  29. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/lmm/types.py +0 -0
  30. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/tools/prompts.py +0 -0
  31. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/__init__.py +0 -0
  32. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/sim.py +0 -0
  33. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/type_defs.py +0 -0
  34. {vision_agent-0.2.110 → vision_agent-0.2.112}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.110
3
+ Version: 0.2.112
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.110"
7
+ version = "0.2.112"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -78,6 +78,8 @@ line_length = 88
78
78
  profile = "black"
79
79
 
80
80
  [tool.mypy]
81
+ plugins = "pydantic.mypy"
82
+
81
83
  exclude = "tests"
82
84
  show_error_context = true
83
85
  pretty = true
@@ -4,14 +4,13 @@ import sys
4
4
  from typing import Any, Dict
5
5
 
6
6
  logging.basicConfig(stream=sys.stdout)
7
- _LOGGER = logging.getLogger(__name__)
8
7
 
9
8
 
10
9
  def extract_json(json_str: str) -> Dict[str, Any]:
11
10
  try:
11
+ json_str = json_str.replace("\n", " ")
12
12
  json_dict = json.loads(json_str)
13
13
  except json.JSONDecodeError:
14
- input_json_str = json_str
15
14
  if "```json" in json_str:
16
15
  json_str = json_str[json_str.find("```json") + len("```json") :]
17
16
  json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
19
18
  json_str = json_str[json_str.find("```") + len("```") :]
20
19
  # get the last ``` not one from an intermediate string
21
20
  json_str = json_str[: json_str.find("}```")]
22
- try:
23
- json_dict = json.loads(json_str)
24
- except json.JSONDecodeError as e:
25
- error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
26
- _LOGGER.exception(error_msg)
27
- raise ValueError(error_msg) from e
21
+
22
+ json_dict = json.loads(json_str)
28
23
  return json_dict # type: ignore
29
24
 
30
25
 
@@ -28,7 +28,7 @@ class DefaultImports:
28
28
  code = [
29
29
  "from typing import *",
30
30
  "from vision_agent.utils.execute import CodeInterpreter",
31
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
31
+ "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
32
32
  ]
33
33
 
34
34
  @staticmethod
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import sys
6
6
  import tempfile
7
+ from json import JSONDecodeError
7
8
  from pathlib import Path
8
9
  from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
9
10
 
@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
86
87
  def format_plans(plans: Dict[str, Any]) -> str:
87
88
  plan_str = ""
88
89
  for k, v in plans.items():
89
- plan_str += f"{k}:\n"
90
- plan_str += "-" + "\n-".join([e["instructions"] for e in v])
90
+ plan_str += "\n" + f"{k}: {v['thoughts']}\n"
91
+ plan_str += " -" + "\n -".join([e for e in v["instructions"]])
91
92
 
92
93
  return plan_str
93
94
 
@@ -228,13 +229,11 @@ def pick_plan(
228
229
  "status": "completed" if tool_output.success else "failed",
229
230
  }
230
231
  )
231
- tool_output_str = ""
232
- if len(tool_output.logs.stdout) > 0:
233
- tool_output_str = tool_output.logs.stdout[0]
232
+ tool_output_str = tool_output.text().strip()
234
233
 
235
234
  if verbosity == 2:
236
235
  _print_code("Code and test after attempted fix:", code)
237
- _LOGGER.info(f"Code execution result after attempte {count}")
236
+ _LOGGER.info(f"Code execution result after attempt {count}")
238
237
 
239
238
  count += 1
240
239
 
@@ -251,7 +250,21 @@ def pick_plan(
251
250
  tool_output=tool_output_str[:20_000],
252
251
  )
253
252
  chat[-1]["content"] = prompt
254
- best_plan = extract_json(model(chat, stream=False)) # type: ignore
253
+
254
+ count = 0
255
+ best_plan = None
256
+ while best_plan is None and count < max_retries:
257
+ try:
258
+ best_plan = extract_json(model(chat, stream=False)) # type: ignore
259
+ except JSONDecodeError as e:
260
+ _LOGGER.exception(
261
+ f"Error while extracting JSON during picking best plan {str(e)}"
262
+ )
263
+ pass
264
+ count += 1
265
+
266
+ if best_plan is None:
267
+ best_plan = {"best_plan": list(plans.keys())[0]}
255
268
 
256
269
  if verbosity >= 1:
257
270
  _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
525
538
 
526
539
 
527
540
  def retrieve_tools(
528
- plans: Dict[str, List[Dict[str, str]]],
541
+ plans: Dict[str, Dict[str, Any]],
529
542
  tool_recommender: Sim,
530
543
  log_progress: Callable[[Dict[str, Any]], None],
531
544
  verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
542
555
  tool_lists: Dict[str, List[Dict[str, str]]] = {}
543
556
  for k, plan in plans.items():
544
557
  tool_lists[k] = []
545
- for task in plan:
546
- tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
558
+ for task in plan["instructions"]:
559
+ tools = tool_recommender.top_k(task, k=2, thresh=0.3)
547
560
  tool_info.extend([e["doc"] for e in tools])
548
561
  tool_desc.extend([e["desc"] for e in tools])
549
562
  tool_lists[k].extend(
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
737
750
  if self.verbosity >= 1:
738
751
  for p in plans:
739
752
  # tabulate will fail if the keys are not the same for all elements
740
- p_fixed = [
741
- {
742
- "instructions": (
743
- e["instructions"] if "instructions" in e else ""
744
- )
745
- }
746
- for e in plans[p]
747
- ]
753
+ p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
748
754
  _LOGGER.info(
749
755
  f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
750
756
  )
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
793
799
  )
794
800
 
795
801
  if self.verbosity >= 1:
802
+ plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
796
803
  _LOGGER.info(
797
- f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
804
+ f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
798
805
  )
799
806
 
800
807
  results = write_and_test_code(
801
808
  chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
802
- plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
809
+ plan=f"\n{plan_i['thoughts']}\n-"
810
+ + "\n-".join([e for e in plan_i["instructions"]]),
803
811
  tool_info=tool_info,
804
812
  tool_output=tool_output_str,
805
813
  tool_utils=T.UTILITIES_DOCSTRING,
@@ -30,18 +30,19 @@ PLAN = """
30
30
 
31
31
  **Instructions**:
32
32
  1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
33
- 2. Output three different plans each utilize a different strategy or tool.
33
+ 2. Output three different plans each utilize a different strategy or set of tools.
34
34
 
35
35
  Output a list of jsons in the following format
36
36
 
37
37
  ```json
38
38
  {{
39
39
  "plan1":
40
- [
41
- {{
42
- "instructions": str # what you should do in this task associated with a tool
43
- }}
44
- ],
40
+ {{
41
+ "thoughts": str # your thought process for choosing this plan
42
+ "instructions": [
43
+ str # what you should do in this task associated with a tool
44
+ ]
45
+ }},
45
46
  "plan2": ...,
46
47
  "plan3": ...
47
48
  }}
@@ -127,7 +128,8 @@ PICK_PLAN = """
127
128
 
128
129
  **Instructions**:
129
130
  1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
130
- 2. Output a JSON object with the following format:
131
+ 2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
132
+ 3. Output a JSON object with the following format:
131
133
  {{
132
134
  "thoughts": str # your thought process for choosing the best plan
133
135
  "best_plan": str # the best plan you have chosen
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
15
15
  **Examples**:
16
16
  Here is an example of how you can interact with a user and Actions to complete a task:
17
17
  --- START EXAMPLES ---
18
- [Current directory: /workspace/test]
18
+ [Current directory: /example/workspace]
19
19
  {examples}
20
20
  --- END EXAMPLES ---
21
21
 
@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
27
27
  Here is the current conversation so far:
28
28
  --- START CONVERSATION ---
29
29
  [Current directory: {dir}]
30
+
30
31
  {conversation}
31
32
  """
32
33
 
33
34
  EXAMPLES_CODE1 = """
34
35
  USER: Can you detect the dogs in this image? Media name dog.jpg
35
36
 
36
- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
37
+ AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
37
38
 
38
39
  OBSERVATION:
39
- [File /workspace/test/dog_detector.py]
40
+ [File /example/workspace/dog_detector.py]
40
41
  0|from vision_agent.tools import load_image, owl_v2
41
42
  1|def detect_dogs(image_path: str):
42
43
  2| image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
44
45
  4| return dogs
45
46
  [End of file]
46
47
 
47
- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
48
+ AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
48
49
 
49
50
  OBSERVATION:
50
51
  ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
55
56
 
56
57
  USER: The the image only has one dog, can you fix this?
57
58
 
58
- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
59
+ AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
59
60
 
60
61
  OBSERVATION:
61
- [File /workspace/test/dog_detector.py]
62
+ [File /example/workspace/dog_detector.py]
62
63
  0|from vision_agent.tools import load_image, owl_v2
63
64
  1|def detect_dogs(image_path: str):
64
65
  2| image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
66
67
  4| return dogs
67
68
  [End of file]
68
69
 
69
- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
70
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
70
71
 
71
72
  OBSERVATION:
72
73
  ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no
82
83
 
83
84
  USER: Yes you can use workers.png
84
85
 
85
- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
86
+ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
86
87
 
87
88
  OBSERVATION:
88
- [File /workspace/test/code.py]
89
+ [File /example/workspace/code.py]
89
90
  0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
90
91
  1|def count_workers_with_helmets(image_path: str):
91
92
  2| image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
104
105
  15| return count
105
106
  [End of file]
106
107
 
107
- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
108
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
108
109
 
109
110
  OBSERVATION:
110
111
  ----- stdout -----
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
4
4
 
5
5
  from requests import Session
6
6
  from requests.adapters import HTTPAdapter
7
- from requests.exceptions import ConnectionError, RequestException, Timeout
8
7
 
9
8
  _LOGGER = logging.getLogger(__name__)
10
9
 
@@ -38,9 +37,22 @@ class BaseHTTP:
38
37
  response.raise_for_status()
39
38
  result: Dict[str, Any] = response.json()
40
39
  _LOGGER.info(json.dumps(result))
41
- except (ConnectionError, Timeout, RequestException) as err:
42
- _LOGGER.warning(f"Error: {err}.")
43
40
  except json.JSONDecodeError:
44
41
  resp_text = response.text
45
42
  _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
43
+ raise
44
+ return result
45
+
46
+ def get(self, url: str) -> Dict[str, Any]:
47
+ formatted_url = f"{self._base_endpoint}/{url}"
48
+ _LOGGER.info(f"Sending data to {formatted_url}")
49
+ try:
50
+ response = self._session.get(url=formatted_url, timeout=self._TIMEOUT)
51
+ response.raise_for_status()
52
+ result: Dict[str, Any] = response.json()
53
+ _LOGGER.info(json.dumps(result))
54
+ except json.JSONDecodeError:
55
+ resp_text = response.text
56
+ _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
57
+ raise
46
58
  return result
@@ -1,10 +1,13 @@
1
1
  import os
2
- from uuid import UUID
3
2
  from typing import List
3
+ from uuid import UUID
4
+
5
+ from requests.exceptions import HTTPError
4
6
 
5
7
  from vision_agent.clients.http import BaseHTTP
6
8
  from vision_agent.utils.type_defs import LandingaiAPIKey
7
- from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
9
+ from vision_agent.utils.exceptions import FineTuneModelNotFound
10
+ from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
8
11
 
9
12
 
10
13
  class LandingPublicAPI(BaseHTTP):
@@ -24,3 +27,12 @@ class LandingPublicAPI(BaseHTTP):
24
27
  }
25
28
  response = self.post(url, payload=data)
26
29
  return UUID(response["jobId"])
30
+
31
+ def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
32
+ url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
33
+ try:
34
+ get_job = self.get(url)
35
+ except HTTPError as err:
36
+ if err.response.status_code == 404:
37
+ raise FineTuneModelNotFound()
38
+ return JobStatus(get_job["status"])
@@ -1,6 +1,8 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
- from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning
3
+ from .meta_tools import (
4
+ META_TOOL_DOCSTRING,
5
+ )
4
6
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
5
7
  from .tools import (
6
8
  TOOL_DESCRIPTIONS,
@@ -17,16 +19,20 @@ from .tools import (
17
19
  detr_segmentation,
18
20
  dpt_hybrid_midas,
19
21
  extract_frames,
20
- florencev2_image_caption,
21
- florencev2_object_detection,
22
- florencev2_roberta_vqa,
23
- florencev2_ocr,
22
+ florence2_image_caption,
23
+ florence2_object_detection,
24
+ florence2_ocr,
25
+ florence2_roberta_vqa,
26
+ florence2_sam2_image,
27
+ florence2_sam2_video,
24
28
  generate_pose_image,
25
29
  generate_soft_edge_image,
26
30
  get_tool_documentation,
27
31
  git_vqa_v2,
28
32
  grounding_dino,
29
33
  grounding_sam,
34
+ ixc25_image_vqa,
35
+ ixc25_video_vqa,
30
36
  load_image,
31
37
  loca_visual_prompt_counting,
32
38
  loca_zero_shot_counting,
@@ -1,6 +1,5 @@
1
1
  import os
2
2
  import subprocess
3
- from uuid import UUID
4
3
  from pathlib import Path
5
4
  from typing import Any, Dict, List, Union
6
5
 
@@ -8,9 +7,6 @@ import vision_agent as va
8
7
  from vision_agent.lmm.types import Message
9
8
  from vision_agent.tools.tool_utils import get_tool_documentation
10
9
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
11
- from vision_agent.utils.image_utils import convert_to_b64
12
- from vision_agent.clients.landing_public_api import LandingPublicAPI
13
- from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
14
10
 
15
11
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
16
12
 
@@ -384,51 +380,11 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str:
384
380
 
385
381
  def get_tool_descriptions() -> str:
386
382
  """Returns a description of all the tools that `generate_vision_code` has access to.
387
- Helpful for answerings questions about what types of vision tasks you can do with
383
+ Helpful for answering questions about what types of vision tasks you can do with
388
384
  `generate_vision_code`."""
389
385
  return TOOL_DESCRIPTIONS
390
386
 
391
387
 
392
- def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
393
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
394
- to detect objects in an image based on a given dataset. It returns the fine
395
- tuning job id.
396
-
397
- Parameters:
398
- bboxes (List[BboxInput]): A list of BboxInput containing the
399
- image path, labels and bounding boxes.
400
- task (PromptTask): The florencev2 fine-tuning task. The options are
401
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
402
-
403
- Returns:
404
- UUID: The fine tuning job id, this id will used to retrieve the fine
405
- tuned model.
406
-
407
- Example
408
- -------
409
- >>> fine_tuning_job_id = florencev2_fine_tuning(
410
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
411
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
412
- "OBJECT_DETECTION"
413
- )
414
- """
415
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
416
- task_input = PromptTask[task]
417
- fine_tuning_request = [
418
- BboxInputBase64(
419
- image=convert_to_b64(bbox_input.image_path),
420
- filename=bbox_input.image_path.split("/")[-1],
421
- labels=bbox_input.labels,
422
- bboxes=bbox_input.bboxes,
423
- )
424
- for bbox_input in bboxes_input
425
- ]
426
- landing_api = LandingPublicAPI()
427
- return landing_api.launch_fine_tuning_job(
428
- "florencev2", task_input, fine_tuning_request
429
- )
430
-
431
-
432
388
  META_TOOL_DOCSTRING = get_tool_documentation(
433
389
  [
434
390
  get_tool_descriptions,
@@ -442,6 +398,5 @@ META_TOOL_DOCSTRING = get_tool_documentation(
442
398
  search_dir,
443
399
  search_file,
444
400
  find_file,
445
- florencev2_fine_tuning,
446
401
  ]
447
402
  )
@@ -1,7 +1,7 @@
1
1
  import inspect
2
2
  import logging
3
3
  import os
4
- from typing import Any, Callable, Dict, List, MutableMapping, Optional
4
+ from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
5
5
 
6
6
  import pandas as pd
7
7
  from IPython.display import display
@@ -15,9 +15,10 @@ from vision_agent.utils.execute import Error, MimeType
15
15
  from vision_agent.utils.type_defs import LandingaiAPIKey
16
16
 
17
17
  _LOGGER = logging.getLogger(__name__)
18
- _LND_API_KEY = LandingaiAPIKey().api_key
19
- _LND_API_URL = "https://api.landing.ai/v1/agent/model"
20
- _LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
18
+ _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
19
+ _LND_BASE_URL = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
20
+ _LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model"
21
+ _LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools"
21
22
 
22
23
 
23
24
  class ToolCallTrace(BaseModel):
@@ -28,8 +29,14 @@ class ToolCallTrace(BaseModel):
28
29
 
29
30
 
30
31
  def send_inference_request(
31
- payload: Dict[str, Any], endpoint_name: str, v2: bool = False
32
+ payload: Dict[str, Any],
33
+ endpoint_name: str,
34
+ files: Optional[List[Tuple[Any, ...]]] = None,
35
+ v2: bool = False,
36
+ metadata_payload: Optional[Dict[str, Any]] = None,
32
37
  ) -> Dict[str, Any]:
38
+ # TODO: runtime_tag and function_name should be in metadata_payload and not included
39
+ # in the service payload
33
40
  try:
34
41
  if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
35
42
  payload["runtime_tag"] = runtime_tag
@@ -44,7 +51,7 @@ def send_inference_request(
44
51
  response={},
45
52
  error=None,
46
53
  )
47
- headers = {"Content-Type": "application/json", "apikey": _LND_API_KEY}
54
+ headers = {"apikey": _LND_API_KEY}
48
55
  if "TOOL_ENDPOINT_AUTH" in os.environ:
49
56
  headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
50
57
  headers.pop("apikey")
@@ -54,7 +61,11 @@ def send_inference_request(
54
61
  num_retry=3,
55
62
  headers=headers,
56
63
  )
57
- res = session.post(url, json=payload)
64
+
65
+ if files is not None:
66
+ res = session.post(url, data=payload, files=files)
67
+ else:
68
+ res = session.post(url, json=payload)
58
69
  if res.status_code != 200:
59
70
  tool_call_trace.error = Error(
60
71
  name="RemoteToolCallFailed",
@@ -62,9 +73,13 @@ def send_inference_request(
62
73
  traceback_raw=[],
63
74
  )
64
75
  _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
65
- raise RemoteToolCallFailed(
66
- payload["function_name"], res.status_code, res.text
67
- )
76
+ # TODO: function_name should be in metadata_payload
77
+ function_name = "unknown"
78
+ if "function_name" in payload:
79
+ function_name = payload["function_name"]
80
+ elif metadata_payload is not None and "function_name" in metadata_payload:
81
+ function_name = metadata_payload["function_name"]
82
+ raise RemoteToolCallFailed(function_name, res.status_code, res.text)
68
83
 
69
84
  resp = res.json()
70
85
  tool_call_trace.response = resp