vision-agent 0.2.98__tar.gz → 0.2.100__tar.gz

Files changed (33)
  1. {vision_agent-0.2.98 → vision_agent-0.2.100}/PKG-INFO +1 -1
  2. {vision_agent-0.2.98 → vision_agent-0.2.100}/pyproject.toml +4 -3
  3. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent.py +1 -1
  4. vision_agent-0.2.100/vision_agent/clients/http.py +46 -0
  5. vision_agent-0.2.100/vision_agent/clients/landing_public_api.py +26 -0
  6. vision_agent-0.2.100/vision_agent/fonts/__init__.py +0 -0
  7. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/__init__.py +1 -1
  8. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/meta_tools.py +45 -0
  9. vision_agent-0.2.100/vision_agent/tools/meta_tools_types.py +30 -0
  10. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/tool_utils.py +4 -3
  11. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/tools.py +55 -62
  12. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/execute.py +2 -2
  13. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/type_defs.py +1 -1
  14. {vision_agent-0.2.98 → vision_agent-0.2.100}/LICENSE +0 -0
  15. {vision_agent-0.2.98 → vision_agent-0.2.100}/README.md +0 -0
  16. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/__init__.py +0 -0
  17. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/__init__.py +0 -0
  18. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/agent.py +0 -0
  19. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/agent_utils.py +0 -0
  20. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_coder.py +0 -0
  21. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  22. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_prompts.py +0 -0
  23. {vision_agent-0.2.98/vision_agent/fonts → vision_agent-0.2.100/vision_agent/clients}/__init__.py +0 -0
  24. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  25. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/lmm/__init__.py +0 -0
  26. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/lmm/lmm.py +0 -0
  27. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/lmm/types.py +0 -0
  28. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/prompts.py +0 -0
  29. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/image_utils.py +0 -0
  32. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/sim.py +0 -0
  33. {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/video.py +0 -0
PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.98
+Version: 0.2.100
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
pyproject.toml:
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.98"
+version = "0.2.100"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -17,6 +17,7 @@ packages = [{include = "vision_agent"}]
 
 [tool.poetry.dependencies] # main dependency group
 python = ">=3.9,<4.0"
+
 numpy = ">=1.21.0,<2.0.0"
 pillow = "10.*"
 requests = "2.*"
@@ -60,6 +61,7 @@ mkdocstrings = {extras = ["python"], version = "^0.23.0"}
 mkdocs-material = "^9.4.2"
 types-tabulate = "^0.9.0.20240106"
 scikit-image = "<0.23.1"
+pre-commit = "^3.8.0"
 
 [tool.pytest.ini_options]
 log_cli = true
@@ -90,7 +92,6 @@ warn_unused_configs = true
 warn_unused_ignores = true
 warn_return_any = true
 show_error_codes = true
-disallow_any_unimported = true
 
 [[tool.mypy.overrides]]
 ignore_missing_imports = true
@@ -101,5 +102,5 @@ module = [
 "sentence_transformers.*",
 "moviepy.*",
 "e2b_code_interpreter.*",
-"e2b.*",
+"e2b.*"
 ]
vision_agent/agent/vision_agent.py:
@@ -28,7 +28,7 @@ class DefaultImports:
     code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
     ]
 
     @staticmethod
vision_agent/clients/http.py (new file):
@@ -0,0 +1,46 @@
+import json
+import logging
+from typing import Any, Dict, Optional
+
+from requests import Session
+from requests.adapters import HTTPAdapter
+from requests.exceptions import ConnectionError, RequestException, Timeout
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class BaseHTTP:
+    _TIMEOUT = 30  # seconds
+    _MAX_RETRIES = 3
+
+    def __init__(
+        self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
+    ) -> None:
+        self._headers = headers
+        if headers is None:
+            self._headers = {
+                "Content-Type": "application/json",
+            }
+        self._base_endpoint = base_endpoint
+        self._session = Session()
+        self._session.headers.update(self._headers)  # type: ignore
+        self._session.mount(
+            self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
+        )
+
+    def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+        formatted_url = f"{self._base_endpoint}/{url}"
+        _LOGGER.info(f"Sending data to {formatted_url}")
+        try:
+            response = self._session.post(
+                url=formatted_url, json=payload, timeout=self._TIMEOUT
+            )
+            response.raise_for_status()
+            result: Dict[str, Any] = response.json()
+            _LOGGER.info(json.dumps(result))
+        except (ConnectionError, Timeout, RequestException) as err:
+            _LOGGER.warning(f"Error: {err}.")
+        except json.JSONDecodeError:
+            resp_text = response.text
+            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+        return result
vision_agent/clients/landing_public_api.py (new file):
@@ -0,0 +1,26 @@
+import os
+from uuid import UUID
+from typing import List
+
+from vision_agent.clients.http import BaseHTTP
+from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
+
+
+class LandingPublicAPI(BaseHTTP):
+    def __init__(self) -> None:
+        landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
+        landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
+        headers = {"Content-Type": "application/json", "apikey": landing_api_key}
+        super().__init__(base_endpoint=landing_url, headers=headers)
+
+    def launch_fine_tuning_job(
+        self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
+    ) -> UUID:
+        url = "v1/agent/jobs/fine-tuning"
+        data = {
+            "model": {"name": model_name, "task": task.value},
+            "bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
+        }
+        response = self.post(url, payload=data)
+        return UUID(response["jobId"])
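A minimal usage sketch of the new client pair above (assumptions: a valid LANDINGAI_API_KEY is set, the fine-tuning endpoint is reachable, and screw.png with its box and label are illustrative stand-ins):

from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
from vision_agent.utils.image_utils import convert_to_b64

# Build one base64-encoded training example for the fine-tuning request.
example = BboxInputBase64(
    image=convert_to_b64("screw.png"),  # hypothetical local file
    filename="screw.png",
    labels=["screw"],
    bboxes=[(370, 30, 560, 290)],
)

api = LandingPublicAPI()  # reads LANDINGAI_URL / LANDINGAI_API_KEY from the environment
job_id = api.launch_fine_tuning_job("florencev2", PromptTask.OBJECT_DETECTION, [example])
print(job_id)  # UUID parsed from the "jobId" field of the response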
vision_agent/fonts/__init__.py (new empty file): File without changes
vision_agent/tools/__init__.py:
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import META_TOOL_DOCSTRING
+from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (
     TOOL_DESCRIPTIONS,
vision_agent/tools/meta_tools.py:
@@ -1,5 +1,6 @@
 import os
 import subprocess
+from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
@@ -7,6 +8,9 @@ import vision_agent as va
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
+    to detect objects in an image based on a given dataset. It returns the fine
+    tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "OBJECT_DETECTION"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         search_dir,
         search_file,
         find_file,
+        florencev2_fine_tuning,
     ]
 )
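As wired up above, the meta tool wraps the new client call; a hedged sketch of invoking it directly (the image paths are hypothetical, and the task string must be a PromptTask member name because the implementation does PromptTask[task]):

from vision_agent.tools.meta_tools import florencev2_fine_tuning

job_id = florencev2_fine_tuning(
    bboxes=[
        {"image_path": "imgs/screw_1.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]},
        {"image_path": "imgs/screw_2.png", "labels": ["screw"], "bboxes": [[120, 0, 300, 170]]},
    ],
    task="OBJECT_DETECTION",  # enum member name, not the "<OD>" prompt value
)
print(f"fine-tuning job id: {job_id}")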
vision_agent/tools/meta_tools_types.py (new file):
@@ -0,0 +1,30 @@
+from enum import Enum
+from typing import List, Tuple
+
+from pydantic import BaseModel
+
+
+class BboxInput(BaseModel):
+    image_path: str
+    labels: List[str]
+    bboxes: List[Tuple[int, int, int, int]]
+
+
+class BboxInputBase64(BaseModel):
+    image: str
+    filename: str
+    labels: List[str]
+    bboxes: List[Tuple[int, int, int, int]]
+
+
+class PromptTask(str, Enum):
+    """
+    Valid task prompts options for the Florencev2 model.
+    """
+
+    CAPTION = "<CAPTION>"
+    """"""
+    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
+    """"""
+    OBJECT_DETECTION = "<OD>"
+    """"""
vision_agent/tools/tool_utils.py:
@@ -16,7 +16,8 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.landing.ai/v1/agent"
+_LND_API_URL = "https://api.landing.ai/v1/agent/model"
+_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
 
 
 class ToolCallTrace(BaseModel):
@@ -27,13 +28,13 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str
+    payload: Dict[str, Any], endpoint_name: str, v2: bool = False
 ) -> Dict[str, Any]:
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
             payload["runtime_tag"] = runtime_tag
 
-        url = f"{_LND_API_URL}/model/{endpoint_name}"
+        url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
         if "TOOL_ENDPOINT_URL" in os.environ:
             url = os.environ["TOOL_ENDPOINT_URL"]
 
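The new v2 flag only switches which base URL send_inference_request targets; a standalone sketch of that routing with the constants copied from the hunk above (the TOOL_ENDPOINT_URL override is omitted):

_LND_API_URL = "https://api.landing.ai/v1/agent/model"
_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"

def build_url(endpoint_name: str, v2: bool = False) -> str:
    # Mirrors: url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
    return f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"

print(build_url("owlv2", v2=True))   # https://api.landing.ai/v1/tools/owlv2
print(build_url("tools"))            # https://api.landing.ai/v1/agent/model/tools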
vision_agent/tools/tools.py:
@@ -2,23 +2,23 @@ import io
 import json
 import logging
 import tempfile
-from importlib import resources
 from pathlib import Path
+from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import cv2
-import numpy as np
 import requests
+import numpy as np
+from pytube import YouTube  # type: ignore
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
-from pytube import YouTube  # type: ignore
 
 from vision_agent.tools.tool_utils import (
+    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
-    send_inference_request,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -126,7 +126,6 @@ def owl_v2(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
-    iou_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
@@ -138,8 +137,6 @@ def owl_v2(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
            to 0.10.
-        iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.10.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -159,22 +156,22 @@ def owl_v2(
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     request_data = {
-        "prompt": prompt,
+        "prompts": prompt.split("."),
         "image": image_b64,
-        "tool": "open_vocab_detection",
-        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "confidence": box_threshold,
         "function_name": "owl_v2",
     }
-    data: Dict[str, Any] = send_inference_request(request_data, "tools")
+    data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
     return_data = []
-    for i in range(len(data["bboxes"])):
-        return_data.append(
-            {
-                "score": round(data["scores"][i], 2),
-                "label": data["labels"][i].strip(),
-                "bbox": normalize_bbox(data["bboxes"][i], image_size),
-            }
-        )
+    if data is not None:
+        for elt in data:
+            return_data.append(
+                {
+                    "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
+                    "label": elt["label"],  # type: ignore
+                    "score": round(elt["score"], 2),  # type: ignore
+                }
+            )
     return return_data
 
 
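After this change owl_v2 splits the prompt on periods and sends a flat confidence instead of the old tool/kwargs payload; a local sketch of the new request body (no network call; the image payload is a stand-in string):

prompt = "screw.nut.washer"
box_threshold = 0.10
request_data = {
    "prompts": prompt.split("."),  # ["screw", "nut", "washer"]
    "image": "<base64-image>",     # convert_to_b64(image) in the real function
    "confidence": box_threshold,
    "function_name": "owl_v2",
}
print(request_data["prompts"])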
@@ -367,11 +364,10 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "zero_shot_counting",
         "function_name": "loca_zero_shot_counting",
     }
-    resp_data = send_inference_request(data, "tools")
-    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
 
 
@@ -397,17 +393,15 @@ def loca_visual_prompt_counting(
 
     image_size = get_image_size(image)
     bbox = visual_prompt["bbox"]
-    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
     image_b64 = convert_to_b64(image)
 
     data = {
         "image": image_b64,
-        "prompt": bbox_str,
-        "tool": "few_shot_counting",
+        "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
         "function_name": "loca_visual_prompt_counting",
     }
-    resp_data = send_inference_request(data, "tools")
-    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
 
 
@@ -432,13 +426,12 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "prompt": prompt,
-        "tool": "image_question_answering_with_context",
+        "question": prompt,
         "function_name": "florencev2_roberta_vqa",
     }
 
-    answer = send_inference_request(data, "tools")
-    return answer["text"][0]  # type: ignore
+    answer = send_inference_request(data, "florence2-qa", v2=True)
+    return answer  # type: ignore
 
 
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
@@ -544,17 +537,16 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     Example
     -------
     >>> vit_nsfw_classification(image)
-    {"labels": "normal", "scores": 0.68},
+    {"label": "normal", "scores": 0.68},
     """
 
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "nsfw_image_classification",
         "function_name": "vit_nsfw_classification",
     }
-    resp_data = send_inference_request(data, "tools")
-    resp_data["scores"] = round(resp_data["scores"], 4)
+    resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+    resp_data["score"] = round(resp_data["score"], 4)
     return resp_data
 
 
@@ -603,21 +595,21 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
+    task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
     data = {
         "image": image_b64,
-        "tool": "florence2_image_captioning",
-        "detail_caption": detail_caption,
+        "task": task,
         "function_name": "florencev2_image_caption",
     }
 
-    answer = send_inference_request(data, "tools")
-    return answer["text"][0]  # type: ignore
+    answer = send_inference_request(data, "florence2", v2=True)
+    return answer[task]  # type: ignore
 
 
-def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect common objects in an
-    image without any text prompt or thresholding. It returns a list of detected objects
-    as labels and their location as bounding boxes.
+def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect objects given a text
+    prompt such as a phrase or class names separated by commas. It returns a list of
+    detected objects as labels and their location as bounding boxes with score of 1.0.
 
     Parameters:
         image (np.ndarray): The image to used to detect objects
@@ -631,29 +623,30 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> florencev2_object_detection(image)
+    >>> florencev2_object_detection(image, 'person looking at a coyote')
     [
-        {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-        {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
     ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "object_detection",
+        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+        "prompt": prompt,
         "function_name": "florencev2_object_detection",
     }
 
-    answer = send_inference_request(data, "tools")
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(answer["bboxes"])):
+    for i in range(len(detections["bboxes"])):
         return_data.append(
             {
-                "score": round(answer["scores"][i], 2),
-                "label": answer["labels"][i],
-                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
             }
         )
     return return_data
@@ -742,13 +735,16 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_depth",
         "function_name": "depth_anything_v2",
     }
 
-    answer = send_inference_request(data, "tools")
-    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
-    return return_data
+    depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
+    depth_map_np = np.array(depth_map["map"])
+    depth_map_np = (depth_map_np - depth_map_np.min()) / (
+        depth_map_np.max() - depth_map_np.min()
+    )
+    depth_map_np = (255 * depth_map_np).astype(np.uint8)
+    return depth_map_np
 
 
 def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
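The reworked depth_anything_v2 now min-max scales the raw depth map to an 8-bit image instead of decoding a base64 mask; a numpy-only sketch of that normalization with made-up values:

import numpy as np

depth_map_np = np.array([[0.2, 1.5], [3.0, 0.2]])  # stand-in for depth_map["map"]

# Same scaling as the new function body: map to [0, 1], then to uint8 [0, 255].
depth_map_np = (depth_map_np - depth_map_np.min()) / (
    depth_map_np.max() - depth_map_np.min()
)
depth_map_np = (255 * depth_map_np).astype(np.uint8)
print(depth_map_np)  # [[  0 118]
                     #  [255   0]]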
@@ -839,12 +835,11 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_pose",
         "function_name": "generate_pose_image",
     }
 
-    answer = send_inference_request(data, "tools")
-    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    pos_img = send_inference_request(data, "pose-detector", v2=True)
+    return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
     return return_data
 
 
@@ -1063,7 +1058,6 @@ def save_video(
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4
-
     with ImageSequenceClip(frames, fps=fps) as video:
         if output_video_path:
             f = open(output_video_path, "wb")
@@ -1254,7 +1248,6 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
-    florencev2_object_detection,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
vision_agent/utils/execute.py:
@@ -209,7 +209,7 @@ class Result:
         return formats
 
     @staticmethod
-    def from_e2b_result(result: E2BResult) -> "Result":  # type: ignore
+    def from_e2b_result(result: E2BResult) -> "Result":
         """
         Creates a Result object from an E2BResult object.
         """
@@ -361,7 +361,7 @@ class Execution(BaseModel):
     )
 
     @staticmethod
-    def from_e2b_execution(exec: E2BExecution) -> "Execution":  # type: ignore
+    def from_e2b_execution(exec: E2BExecution) -> "Execution":
         """Creates an Execution object from an E2BResult object."""
         return Execution(
             results=[Result.from_e2b_result(res) for res in exec.results],
vision_agent/utils/type_defs.py:
@@ -14,7 +14,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="land_sk_fnmSzD0ksknSfvhyD8UGu9R4ss3bKfLL1Im5gb6tDQTy2z1Oy5",
+        default="land_sk_zKvyPcPV2bVoq7q87KwduoerAxuQpx33DnqP8M1BliOCiZOSoI",
         alias="LANDINGAI_API_KEY",
         description="The API key of LandingAI.",
     )
Remaining files (LICENSE, README.md, and the other entries listed above with +0 -0): File without changes