vision-agent 0.2.110__py3-none-any.whl → 0.2.112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,14 +4,13 @@ import sys
 from typing import Any, Dict
 
 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json") :]
            json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
            json_str = json_str[json_str.find("```") + len("```") :]
            # get the last ``` not one from an intermediate string
            json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore
 
 
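Taken together, these two hunks simplify `extract_json`: newlines are collapsed before the first parse attempt, the fallback path only strips markdown fences, and a parse failure now surfaces to the caller as `json.JSONDecodeError` instead of a logged `ValueError`. A minimal standalone sketch of the new control flow (simplified: the packaged function also handles fences without the `json` tag):

```python
import json
from typing import Any, Dict


def extract_json_sketch(json_str: str) -> Dict[str, Any]:
    # Collapse newlines first; models often wrap JSON strings across lines.
    json_str = json_str.replace("\n", " ")
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Fallback: strip a ```json markdown fence if the model emitted one.
        if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json") :]
            json_str = json_str[: json_str.find("```")]
        # A second failure now propagates as json.JSONDecodeError;
        # callers such as pick_plan are expected to catch it.
        return json.loads(json_str)


print(extract_json_sketch('```json\n{"best_plan": "plan1"}\n```'))
# -> {'best_plan': 'plan1'}
```
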
@@ -28,7 +28,7 @@ class DefaultImports:
     code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
+        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
     ]
 
     @staticmethod
@@ -4,6 +4,7 @@ import logging
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
 
@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}:\n"
-        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])
 
     return plan_str
 
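The new `format_plans` reflects a schema change that runs through the rest of this release: each plan is now a dict with a `thoughts` string and a flat list of `instructions` strings, where it used to be a list of `{"instructions": ...}` dicts. A hedged illustration with invented plan content:

```python
from typing import Any, Dict

# Invented plan in the new shape: "thoughts" plus plain instruction strings.
plans: Dict[str, Any] = {
    "plan1": {
        "thoughts": "owl_v2 handles open-vocabulary detection well.",
        "instructions": [
            "Load the image with load_image",
            "Detect dogs with owl_v2",
        ],
    },
}

# Same traversal as the patched helper.
for k, v in plans.items():
    print(f"{k}: {v['thoughts']}")
    print(" -" + "\n -".join(v["instructions"]))
```
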
@@ -228,13 +229,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str = ""
-        if len(tool_output.logs.stdout) > 0:
-            tool_output_str = tool_output.logs.stdout[0]
+        tool_output_str = tool_output.text().strip()
 
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempte {count}")
+            _LOGGER.info(f"Code execution result after attempt {count}")
 
         count += 1
 
@@ -251,7 +250,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat, stream=False))  # type: ignore
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}
 
     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
 
 
 def retrieve_tools(
-    plans: Dict[str, List[Dict[str, str]]],
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
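`retrieve_tools` follows the same schema change: the recommender is now queried once per plain instruction string. A sketch with a stub standing in for the `Sim` recommender (the stub and its canned results are invented):

```python
from typing import Dict, List


class StubRecommender:
    # Same top_k call shape as used in the hunk above; returns canned matches.
    def top_k(self, query: str, k: int = 2, thresh: float = 0.3) -> List[Dict[str, str]]:
        return [{"doc": f"doc for '{query}'", "desc": f"desc for '{query}'"}]


plans = {
    "plan1": {"thoughts": "...", "instructions": ["detect dogs", "count the boxes"]},
}

recommender = StubRecommender()
for name, plan in plans.items():
    for task in plan["instructions"]:  # plain strings in the new schema
        print(name, [t["desc"] for t in recommender.top_k(task, k=2, thresh=0.3)])
```
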
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
         if self.verbosity >= 1:
             for p in plans:
                 # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [
-                    {
-                        "instructions": (
-                            e["instructions"] if "instructions" in e else ""
-                        )
-                    }
-                    for e in plans[p]
-                ]
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                 _LOGGER.info(
                     f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
             )
 
         if self.verbosity >= 1:
+            plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
             _LOGGER.info(
-                f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
             )
 
         results = write_and_test_code(
             chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-            plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
+            plan=f"\n{plan_i['thoughts']}\n-"
+            + "\n-".join([e for e in plan_i["instructions"]]),
             tool_info=tool_info,
             tool_output=tool_output_str,
             tool_utils=T.UTILITIES_DOCSTRING,
@@ -30,18 +30,19 @@ PLAN = """
 
 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
+2. Output three different plans each utilize a different strategy or set of tools.
 
 Output a list of jsons in the following format
 
 ```json
 {{
     "plan1":
-        [
-            {{
-                "instructions": str # what you should do in this task associated with a tool
-            }}
-        ],
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
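For reference, a model response matching the revised PLAN format might look like the following (the plan contents are invented; only the shape comes from the prompt):

```python
import json

sample = """
{
    "plan1": {
        "thoughts": "Open-vocabulary detection fits this request best.",
        "instructions": ["Load the image", "Detect dogs with owl_v2"]
    },
    "plan2": {
        "thoughts": "Try an alternative detector for comparison.",
        "instructions": ["Load the image", "Detect dogs with grounding_dino"]
    },
    "plan3": {
        "thoughts": "Counting directly avoids detection thresholds.",
        "instructions": ["Load the image", "Count dogs with loca_zero_shot_counting"]
    }
}
"""
plans = json.loads(sample)
assert set(plans["plan1"]) == {"thoughts", "instructions"}
```
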
@@ -127,7 +128,8 @@ PICK_PLAN = """
 
 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace/test]
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """
 
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/code.py]
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
 15|    return count
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
 
 from requests import Session
 from requests.adapters import HTTPAdapter
-from requests.exceptions import ConnectionError, RequestException, Timeout
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -38,9 +37,22 @@ class BaseHTTP:
             response.raise_for_status()
             result: Dict[str, Any] = response.json()
             _LOGGER.info(json.dumps(result))
-        except (ConnectionError, Timeout, RequestException) as err:
-            _LOGGER.warning(f"Error: {err}.")
         except json.JSONDecodeError:
             resp_text = response.text
             _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+            raise
+        return result
+
+    def get(self, url: str) -> Dict[str, Any]:
+        formatted_url = f"{self._base_endpoint}/{url}"
+        _LOGGER.info(f"Sending data to {formatted_url}")
+        try:
+            response = self._session.get(url=formatted_url, timeout=self._TIMEOUT)
+            response.raise_for_status()
+            result: Dict[str, Any] = response.json()
+            _LOGGER.info(json.dumps(result))
+        except json.JSONDecodeError:
+            resp_text = response.text
+            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+            raise
         return result
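The refactored `BaseHTTP` drops the broad `requests` exception handling (HTTP errors now propagate from `raise_for_status()`), re-raises `JSONDecodeError` instead of falling through to a possibly unbound `result`, and gains a `get` counterpart to `post`. A freestanding sketch of the same error-handling contract, not the class itself:

```python
import json
import logging
from typing import Any, Dict

import requests

_LOGGER = logging.getLogger(__name__)


def get_json(base_endpoint: str, path: str, timeout: int = 30) -> Dict[str, Any]:
    # HTTP errors surface via raise_for_status(); bad JSON is logged, then
    # re-raised so callers can react (e.g. a 404 mapped to a domain error).
    response = requests.get(f"{base_endpoint}/{path}", timeout=timeout)
    response.raise_for_status()
    try:
        result: Dict[str, Any] = response.json()
    except json.JSONDecodeError:
        _LOGGER.warning(f"Response seems incorrect: '{response.text}'.")
        raise
    return result
```
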
@@ -1,10 +1,13 @@
 import os
-from uuid import UUID
 from typing import List
+from uuid import UUID
+
+from requests.exceptions import HTTPError
 
 from vision_agent.clients.http import BaseHTTP
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
+from vision_agent.utils.exceptions import FineTuneModelNotFound
+from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
 
 
 class LandingPublicAPI(BaseHTTP):
@@ -24,3 +27,12 @@ class LandingPublicAPI(BaseHTTP):
         }
         response = self.post(url, payload=data)
         return UUID(response["jobId"])
+
+    def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
+        url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
+        try:
+            get_job = self.get(url)
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise FineTuneModelNotFound()
+        return JobStatus(get_job["status"])
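A hedged usage sketch for the new endpoint: poll until the job leaves its initial state, treating a 404 as a missing fine-tune. The `JobStatus` member name below is an assumption, since the enum's definition is not part of this diff:

```python
import time
from uuid import UUID

from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.tools_types import JobStatus
from vision_agent.utils.exceptions import FineTuneModelNotFound

api = LandingPublicAPI()
job_id = UUID("12345678-1234-5678-1234-567812345678")  # placeholder id

try:
    status = api.check_fine_tuning_job(job_id)
    while status == JobStatus.PENDING:  # assumed member name
        time.sleep(10)
        status = api.check_fine_tuning_job(job_id)
    print(f"Job finished with status: {status}")
except FineTuneModelNotFound:
    print("No fine-tuning job with that id.")
```
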
@@ -1,6 +1,8 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning
+from .meta_tools import (
+    META_TOOL_DOCSTRING,
+)
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (
     TOOL_DESCRIPTIONS,
@@ -17,16 +19,20 @@ from .tools import (
     detr_segmentation,
     dpt_hybrid_midas,
     extract_frames,
-    florencev2_image_caption,
-    florencev2_object_detection,
-    florencev2_roberta_vqa,
-    florencev2_ocr,
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
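For downstream code, the practical effect of this hunk is a rename plus new tools: the `florencev2_*` helpers become `florence2_*`, and `florence2_sam2_image`, `florence2_sam2_video`, `ixc25_image_vqa`, and `ixc25_video_vqa` appear. Assuming the package re-exports these from `vision_agent.tools` as before, a migration looks like:

```python
# 0.2.110 spelling (removed in this release):
# from vision_agent.tools import florencev2_image_caption, florencev2_ocr

# 0.2.112 spelling, plus two of the newly exported tools:
from vision_agent.tools import (
    florence2_image_caption,
    florence2_ocr,
    florence2_sam2_image,
    ixc25_image_vqa,
)
```
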
@@ -1,6 +1,5 @@
 import os
 import subprocess
-from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
 
@@ -8,9 +7,6 @@ import vision_agent as va
8
7
  from vision_agent.lmm.types import Message
9
8
  from vision_agent.tools.tool_utils import get_tool_documentation
10
9
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
11
- from vision_agent.utils.image_utils import convert_to_b64
12
- from vision_agent.clients.landing_public_api import LandingPublicAPI
13
- from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
14
10
 
15
11
  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
16
12
 
@@ -384,51 +380,11 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str:
384
380
 
385
381
  def get_tool_descriptions() -> str:
386
382
  """Returns a description of all the tools that `generate_vision_code` has access to.
387
- Helpful for answerings questions about what types of vision tasks you can do with
383
+ Helpful for answering questions about what types of vision tasks you can do with
388
384
  `generate_vision_code`."""
389
385
  return TOOL_DESCRIPTIONS
390
386
 
391
387
 
392
- def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
393
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
394
- to detect objects in an image based on a given dataset. It returns the fine
395
- tuning job id.
396
-
397
- Parameters:
398
- bboxes (List[BboxInput]): A list of BboxInput containing the
399
- image path, labels and bounding boxes.
400
- task (PromptTask): The florencev2 fine-tuning task. The options are
401
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
402
-
403
- Returns:
404
- UUID: The fine tuning job id, this id will used to retrieve the fine
405
- tuned model.
406
-
407
- Example
408
- -------
409
- >>> fine_tuning_job_id = florencev2_fine_tuning(
410
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
411
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
412
- "OBJECT_DETECTION"
413
- )
414
- """
415
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
416
- task_input = PromptTask[task]
417
- fine_tuning_request = [
418
- BboxInputBase64(
419
- image=convert_to_b64(bbox_input.image_path),
420
- filename=bbox_input.image_path.split("/")[-1],
421
- labels=bbox_input.labels,
422
- bboxes=bbox_input.bboxes,
423
- )
424
- for bbox_input in bboxes_input
425
- ]
426
- landing_api = LandingPublicAPI()
427
- return landing_api.launch_fine_tuning_job(
428
- "florencev2", task_input, fine_tuning_request
429
- )
430
-
431
-
432
388
  META_TOOL_DOCSTRING = get_tool_documentation(
433
389
  [
434
390
  get_tool_descriptions,
@@ -442,6 +398,5 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         search_dir,
         search_file,
         find_file,
-        florencev2_fine_tuning,
     ]
 )
@@ -1,7 +1,7 @@
 import inspect
 import logging
 import os
-from typing import Any, Callable, Dict, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd
 from IPython.display import display
@@ -15,9 +15,10 @@ from vision_agent.utils.execute import Error, MimeType
 from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
-_LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.landing.ai/v1/agent/model"
-_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
+_LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
+_LND_BASE_URL = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
+_LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model"
+_LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools"
 
 
 class ToolCallTrace(BaseModel):
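Both the API key and the base URL are now overridable through the environment, which makes it possible to point the tools at a non-production host. Since these module-level constants are computed at import time, the variables must be set before the first import (the values below are placeholders):

```python
import os

# Must happen before vision_agent.tools is imported anywhere.
os.environ["LANDINGAI_API_KEY"] = "land_sk_placeholder"          # placeholder key
os.environ["LANDINGAI_URL"] = "https://staging.landing.example"  # hypothetical host

import vision_agent.tools  # module constants now pick up the overrides
```
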
@@ -28,8 +29,14 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str, v2: bool = False
+    payload: Dict[str, Any],
+    endpoint_name: str,
+    files: Optional[List[Tuple[Any, ...]]] = None,
+    v2: bool = False,
+    metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
+    # TODO: runtime_tag and function_name should be metadata_payload and now included
+    # in the service payload
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
             payload["runtime_tag"] = runtime_tag
@@ -44,7 +51,7 @@ def send_inference_request(
         response={},
         error=None,
     )
-    headers = {"Content-Type": "application/json", "apikey": _LND_API_KEY}
+    headers = {"apikey": _LND_API_KEY}
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -54,7 +61,11 @@ def send_inference_request(
         num_retry=3,
         headers=headers,
     )
-    res = session.post(url, json=payload)
+
+    if files is not None:
+        res = session.post(url, data=payload, files=files)
+    else:
+        res = session.post(url, json=payload)
     if res.status_code != 200:
         tool_call_trace.error = Error(
             name="RemoteToolCallFailed",
@@ -62,9 +73,13 @@ def send_inference_request(
             traceback_raw=[],
         )
         _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
-        raise RemoteToolCallFailed(
-            payload["function_name"], res.status_code, res.text
-        )
+        # TODO: function_name should be in metadata_payload
+        function_name = "unknown"
+        if "function_name" in payload:
+            function_name = payload["function_name"]
+        elif metadata_payload is not None and "function_name" in metadata_payload:
+            function_name = metadata_payload["function_name"]
+        raise RemoteToolCallFailed(function_name, res.status_code, res.text)
 
     resp = res.json()
     tool_call_trace.response = resp
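Finally, error reporting no longer assumes `function_name` lives in the service payload: it is looked up in `payload`, then in the new `metadata_payload`, and defaults to `"unknown"`. A small sketch of the same precedence:

```python
from typing import Any, Dict, Optional


def resolve_function_name(
    payload: Dict[str, Any],
    metadata_payload: Optional[Dict[str, Any]] = None,
) -> str:
    # Mirrors the hunk above: payload wins, metadata_payload is the
    # fallback, and "unknown" keeps error reporting from raising KeyError.
    if "function_name" in payload:
        return payload["function_name"]
    if metadata_payload is not None and "function_name" in metadata_payload:
        return metadata_payload["function_name"]
    return "unknown"


assert resolve_function_name({"function_name": "owl_v2"}) == "owl_v2"
assert resolve_function_name({}, {"function_name": "ixc25_video_vqa"}) == "ixc25_video_vqa"
assert resolve_function_name({}) == "unknown"
```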