vision-agent 0.2.98__tar.gz → 0.2.100__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.98 → vision_agent-0.2.100}/PKG-INFO +1 -1
- {vision_agent-0.2.98 → vision_agent-0.2.100}/pyproject.toml +4 -3
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent.py +1 -1
- vision_agent-0.2.100/vision_agent/clients/http.py +46 -0
- vision_agent-0.2.100/vision_agent/clients/landing_public_api.py +26 -0
- vision_agent-0.2.100/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/__init__.py +1 -1
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/meta_tools.py +45 -0
- vision_agent-0.2.100/vision_agent/tools/meta_tools_types.py +30 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/tool_utils.py +4 -3
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/tools.py +55 -62
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.98 → vision_agent-0.2.100}/LICENSE +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/README.md +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.98/vision_agent/fonts → vision_agent-0.2.100/vision_agent/clients}/__init__.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/utils/video.py +0 -0
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "vision-agent"
|
7
|
-
version = "0.2.98"
|
7
|
+
version = "0.2.100"
|
8
8
|
description = "Toolset for Vision Agent"
|
9
9
|
authors = ["Landing AI <dev@landing.ai>"]
|
10
10
|
readme = "README.md"
|
@@ -17,6 +17,7 @@ packages = [{include = "vision_agent"}]
|
|
17
17
|
|
18
18
|
[tool.poetry.dependencies] # main dependency group
|
19
19
|
python = ">=3.9,<4.0"
|
20
|
+
|
20
21
|
numpy = ">=1.21.0,<2.0.0"
|
21
22
|
pillow = "10.*"
|
22
23
|
requests = "2.*"
|
@@ -60,6 +61,7 @@ mkdocstrings = {extras = ["python"], version = "^0.23.0"}
|
|
60
61
|
mkdocs-material = "^9.4.2"
|
61
62
|
types-tabulate = "^0.9.0.20240106"
|
62
63
|
scikit-image = "<0.23.1"
|
64
|
+
pre-commit = "^3.8.0"
|
63
65
|
|
64
66
|
[tool.pytest.ini_options]
|
65
67
|
log_cli = true
|
@@ -90,7 +92,6 @@ warn_unused_configs = true
|
|
90
92
|
warn_unused_ignores = true
|
91
93
|
warn_return_any = true
|
92
94
|
show_error_codes = true
|
93
|
-
disallow_any_unimported = true
|
94
95
|
|
95
96
|
[[tool.mypy.overrides]]
|
96
97
|
ignore_missing_imports = true
|
@@ -101,5 +102,5 @@ module = [
|
|
101
102
|
"sentence_transformers.*",
|
102
103
|
"moviepy.*",
|
103
104
|
"e2b_code_interpreter.*",
|
104
|
-
"e2b.*"
|
105
|
+
"e2b.*"
|
105
106
|
]
|
@@ -28,7 +28,7 @@ class DefaultImports:
|
|
28
28
|
code = [
|
29
29
|
"from typing import *",
|
30
30
|
"from vision_agent.utils.execute import CodeInterpreter",
|
31
|
-
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
|
31
|
+
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
|
32
32
|
]
|
33
33
|
|
34
34
|
@staticmethod
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from typing import Any, Dict, Optional
|
4
|
+
|
5
|
+
from requests import Session
|
6
|
+
from requests.adapters import HTTPAdapter
|
7
|
+
from requests.exceptions import ConnectionError, RequestException, Timeout
|
8
|
+
|
9
|
+
_LOGGER = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class BaseHTTP:
|
13
|
+
_TIMEOUT = 30 # seconds
|
14
|
+
_MAX_RETRIES = 3
|
15
|
+
|
16
|
+
def __init__(
|
17
|
+
self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
|
18
|
+
) -> None:
|
19
|
+
self._headers = headers
|
20
|
+
if headers is None:
|
21
|
+
self._headers = {
|
22
|
+
"Content-Type": "application/json",
|
23
|
+
}
|
24
|
+
self._base_endpoint = base_endpoint
|
25
|
+
self._session = Session()
|
26
|
+
self._session.headers.update(self._headers) # type: ignore
|
27
|
+
self._session.mount(
|
28
|
+
self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
|
29
|
+
)
|
30
|
+
|
31
|
+
def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
32
|
+
formatted_url = f"{self._base_endpoint}/{url}"
|
33
|
+
_LOGGER.info(f"Sending data to {formatted_url}")
|
34
|
+
try:
|
35
|
+
response = self._session.post(
|
36
|
+
url=formatted_url, json=payload, timeout=self._TIMEOUT
|
37
|
+
)
|
38
|
+
response.raise_for_status()
|
39
|
+
result: Dict[str, Any] = response.json()
|
40
|
+
_LOGGER.info(json.dumps(result))
|
41
|
+
except (ConnectionError, Timeout, RequestException) as err:
|
42
|
+
_LOGGER.warning(f"Error: {err}.")
|
43
|
+
except json.JSONDecodeError:
|
44
|
+
resp_text = response.text
|
45
|
+
_LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
|
46
|
+
return result
|
@@ -0,0 +1,26 @@
|
|
1
|
+
import os
|
2
|
+
from uuid import UUID
|
3
|
+
from typing import List
|
4
|
+
|
5
|
+
from vision_agent.clients.http import BaseHTTP
|
6
|
+
from vision_agent.utils.type_defs import LandingaiAPIKey
|
7
|
+
from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
|
8
|
+
|
9
|
+
|
10
|
+
class LandingPublicAPI(BaseHTTP):
|
11
|
+
def __init__(self) -> None:
|
12
|
+
landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
|
13
|
+
landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
|
14
|
+
headers = {"Content-Type": "application/json", "apikey": landing_api_key}
|
15
|
+
super().__init__(base_endpoint=landing_url, headers=headers)
|
16
|
+
|
17
|
+
def launch_fine_tuning_job(
|
18
|
+
self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
|
19
|
+
) -> UUID:
|
20
|
+
url = "v1/agent/jobs/fine-tuning"
|
21
|
+
data = {
|
22
|
+
"model": {"name": model_name, "task": task.value},
|
23
|
+
"bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
|
24
|
+
}
|
25
|
+
response = self.post(url, payload=data)
|
26
|
+
return UUID(response["jobId"])
|
File without changes
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import os
|
2
2
|
import subprocess
|
3
|
+
from uuid import UUID
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, Dict, List, Union
|
5
6
|
|
@@ -7,6 +8,9 @@ import vision_agent as va
|
|
7
8
|
from vision_agent.lmm.types import Message
|
8
9
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
9
10
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
11
|
+
from vision_agent.utils.image_utils import convert_to_b64
|
12
|
+
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
13
|
+
from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
|
10
14
|
|
11
15
|
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
12
16
|
|
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
|
|
385
389
|
return TOOL_DESCRIPTIONS
|
386
390
|
|
387
391
|
|
392
|
+
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
|
393
|
+
"""'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
|
394
|
+
to detect objects in an image based on a given dataset. It returns the fine
|
395
|
+
tuning job id.
|
396
|
+
|
397
|
+
Parameters:
|
398
|
+
bboxes (List[BboxInput]): A list of BboxInput containing the
|
399
|
+
image path, labels and bounding boxes.
|
400
|
+
task (PromptTask): The florencev2 fine-tuning task. The options are
|
401
|
+
CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
|
402
|
+
|
403
|
+
Returns:
|
404
|
+
UUID: The fine tuning job id, this id will used to retrieve the fine
|
405
|
+
tuned model.
|
406
|
+
|
407
|
+
Example
|
408
|
+
-------
|
409
|
+
>>> fine_tuning_job_id = florencev2_fine_tuning(
|
410
|
+
[{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
|
411
|
+
{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
|
412
|
+
"OBJECT_DETECTION"
|
413
|
+
)
|
414
|
+
"""
|
415
|
+
bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
|
416
|
+
task_input = PromptTask[task]
|
417
|
+
fine_tuning_request = [
|
418
|
+
BboxInputBase64(
|
419
|
+
image=convert_to_b64(bbox_input.image_path),
|
420
|
+
filename=bbox_input.image_path.split("/")[-1],
|
421
|
+
labels=bbox_input.labels,
|
422
|
+
bboxes=bbox_input.bboxes,
|
423
|
+
)
|
424
|
+
for bbox_input in bboxes_input
|
425
|
+
]
|
426
|
+
landing_api = LandingPublicAPI()
|
427
|
+
return landing_api.launch_fine_tuning_job(
|
428
|
+
"florencev2", task_input, fine_tuning_request
|
429
|
+
)
|
430
|
+
|
431
|
+
|
388
432
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
389
433
|
[
|
390
434
|
get_tool_descriptions,
|
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
398
442
|
search_dir,
|
399
443
|
search_file,
|
400
444
|
find_file,
|
445
|
+
florencev2_fine_tuning,
|
401
446
|
]
|
402
447
|
)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import List, Tuple
|
3
|
+
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
|
7
|
+
class BboxInput(BaseModel):
|
8
|
+
image_path: str
|
9
|
+
labels: List[str]
|
10
|
+
bboxes: List[Tuple[int, int, int, int]]
|
11
|
+
|
12
|
+
|
13
|
+
class BboxInputBase64(BaseModel):
|
14
|
+
image: str
|
15
|
+
filename: str
|
16
|
+
labels: List[str]
|
17
|
+
bboxes: List[Tuple[int, int, int, int]]
|
18
|
+
|
19
|
+
|
20
|
+
class PromptTask(str, Enum):
|
21
|
+
"""
|
22
|
+
Valid task prompts options for the Florencev2 model.
|
23
|
+
"""
|
24
|
+
|
25
|
+
CAPTION = "<CAPTION>"
|
26
|
+
""""""
|
27
|
+
CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
|
28
|
+
""""""
|
29
|
+
OBJECT_DETECTION = "<OD>"
|
30
|
+
""""""
|
@@ -16,7 +16,8 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
|
|
16
16
|
|
17
17
|
_LOGGER = logging.getLogger(__name__)
|
18
18
|
_LND_API_KEY = LandingaiAPIKey().api_key
|
19
|
-
_LND_API_URL = "https://api.landing.ai/v1/agent"
|
19
|
+
_LND_API_URL = "https://api.landing.ai/v1/agent/model"
|
20
|
+
_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
|
20
21
|
|
21
22
|
|
22
23
|
class ToolCallTrace(BaseModel):
|
@@ -27,13 +28,13 @@ class ToolCallTrace(BaseModel):
|
|
27
28
|
|
28
29
|
|
29
30
|
def send_inference_request(
|
30
|
-
payload: Dict[str, Any], endpoint_name: str
|
31
|
+
payload: Dict[str, Any], endpoint_name: str, v2: bool = False
|
31
32
|
) -> Dict[str, Any]:
|
32
33
|
try:
|
33
34
|
if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
|
34
35
|
payload["runtime_tag"] = runtime_tag
|
35
36
|
|
36
|
-
url = f"{_LND_API_URL}/model/{endpoint_name}"
|
37
|
+
url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
|
37
38
|
if "TOOL_ENDPOINT_URL" in os.environ:
|
38
39
|
url = os.environ["TOOL_ENDPOINT_URL"]
|
39
40
|
|
@@ -2,23 +2,23 @@ import io
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import tempfile
|
5
|
-
from importlib import resources
|
6
5
|
from pathlib import Path
|
6
|
+
from importlib import resources
|
7
7
|
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
8
8
|
|
9
9
|
import cv2
|
10
|
-
import numpy as np
|
11
10
|
import requests
|
11
|
+
import numpy as np
|
12
|
+
from pytube import YouTube # type: ignore
|
12
13
|
from moviepy.editor import ImageSequenceClip
|
13
14
|
from PIL import Image, ImageDraw, ImageFont
|
14
15
|
from pillow_heif import register_heif_opener # type: ignore
|
15
|
-
from pytube import YouTube # type: ignore
|
16
16
|
|
17
17
|
from vision_agent.tools.tool_utils import (
|
18
|
+
send_inference_request,
|
18
19
|
get_tool_descriptions,
|
19
20
|
get_tool_documentation,
|
20
21
|
get_tools_df,
|
21
|
-
send_inference_request,
|
22
22
|
)
|
23
23
|
from vision_agent.utils import extract_frames_from_video
|
24
24
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
@@ -126,7 +126,6 @@ def owl_v2(
|
|
126
126
|
prompt: str,
|
127
127
|
image: np.ndarray,
|
128
128
|
box_threshold: float = 0.10,
|
129
|
-
iou_threshold: float = 0.10,
|
130
129
|
) -> List[Dict[str, Any]]:
|
131
130
|
"""'owl_v2' is a tool that can detect and count multiple objects given a text
|
132
131
|
prompt such as category names or referring expressions. The categories in text prompt
|
@@ -138,8 +137,6 @@ def owl_v2(
|
|
138
137
|
image (np.ndarray): The image to ground the prompt to.
|
139
138
|
box_threshold (float, optional): The threshold for the box detection. Defaults
|
140
139
|
to 0.10.
|
141
|
-
iou_threshold (float, optional): The threshold for the Intersection over Union
|
142
|
-
(IoU). Defaults to 0.10.
|
143
140
|
|
144
141
|
Returns:
|
145
142
|
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
@@ -159,22 +156,22 @@ def owl_v2(
|
|
159
156
|
image_size = image.shape[:2]
|
160
157
|
image_b64 = convert_to_b64(image)
|
161
158
|
request_data = {
|
162
|
-
"
|
159
|
+
"prompts": prompt.split("."),
|
163
160
|
"image": image_b64,
|
164
|
-
"
|
165
|
-
"kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
|
161
|
+
"confidence": box_threshold,
|
166
162
|
"function_name": "owl_v2",
|
167
163
|
}
|
168
|
-
data: Dict[str, Any] = send_inference_request(request_data, "
|
164
|
+
data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
|
169
165
|
return_data = []
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
166
|
+
if data is not None:
|
167
|
+
for elt in data:
|
168
|
+
return_data.append(
|
169
|
+
{
|
170
|
+
"bbox": normalize_bbox(elt["bbox"], image_size), # type: ignore
|
171
|
+
"label": elt["label"], # type: ignore
|
172
|
+
"score": round(elt["score"], 2), # type: ignore
|
173
|
+
}
|
174
|
+
)
|
178
175
|
return return_data
|
179
176
|
|
180
177
|
|
@@ -367,11 +364,10 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
|
|
367
364
|
image_b64 = convert_to_b64(image)
|
368
365
|
data = {
|
369
366
|
"image": image_b64,
|
370
|
-
"tool": "zero_shot_counting",
|
371
367
|
"function_name": "loca_zero_shot_counting",
|
372
368
|
}
|
373
|
-
resp_data = send_inference_request(data, "
|
374
|
-
resp_data["heat_map"] = np.array(
|
369
|
+
resp_data = send_inference_request(data, "loca", v2=True)
|
370
|
+
resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
|
375
371
|
return resp_data
|
376
372
|
|
377
373
|
|
@@ -397,17 +393,15 @@ def loca_visual_prompt_counting(
|
|
397
393
|
|
398
394
|
image_size = get_image_size(image)
|
399
395
|
bbox = visual_prompt["bbox"]
|
400
|
-
bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
|
401
396
|
image_b64 = convert_to_b64(image)
|
402
397
|
|
403
398
|
data = {
|
404
399
|
"image": image_b64,
|
405
|
-
"
|
406
|
-
"tool": "few_shot_counting",
|
400
|
+
"bbox": list(map(int, denormalize_bbox(bbox, image_size))),
|
407
401
|
"function_name": "loca_visual_prompt_counting",
|
408
402
|
}
|
409
|
-
resp_data = send_inference_request(data, "
|
410
|
-
resp_data["heat_map"] = np.array(
|
403
|
+
resp_data = send_inference_request(data, "loca", v2=True)
|
404
|
+
resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
|
411
405
|
return resp_data
|
412
406
|
|
413
407
|
|
@@ -432,13 +426,12 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
|
|
432
426
|
image_b64 = convert_to_b64(image)
|
433
427
|
data = {
|
434
428
|
"image": image_b64,
|
435
|
-
"
|
436
|
-
"tool": "image_question_answering_with_context",
|
429
|
+
"question": prompt,
|
437
430
|
"function_name": "florencev2_roberta_vqa",
|
438
431
|
}
|
439
432
|
|
440
|
-
answer = send_inference_request(data, "
|
441
|
-
return answer
|
433
|
+
answer = send_inference_request(data, "florence2-qa", v2=True)
|
434
|
+
return answer # type: ignore
|
442
435
|
|
443
436
|
|
444
437
|
def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
|
@@ -544,17 +537,16 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
|
|
544
537
|
Example
|
545
538
|
-------
|
546
539
|
>>> vit_nsfw_classification(image)
|
547
|
-
{"
|
540
|
+
{"label": "normal", "scores": 0.68},
|
548
541
|
"""
|
549
542
|
|
550
543
|
image_b64 = convert_to_b64(image)
|
551
544
|
data = {
|
552
545
|
"image": image_b64,
|
553
|
-
"tool": "nsfw_image_classification",
|
554
546
|
"function_name": "vit_nsfw_classification",
|
555
547
|
}
|
556
|
-
resp_data = send_inference_request(data, "
|
557
|
-
resp_data["
|
548
|
+
resp_data = send_inference_request(data, "nsfw-classification", v2=True)
|
549
|
+
resp_data["score"] = round(resp_data["score"], 4)
|
558
550
|
return resp_data
|
559
551
|
|
560
552
|
|
@@ -603,21 +595,21 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
|
|
603
595
|
'This image contains a cat sitting on a table with a bowl of milk.'
|
604
596
|
"""
|
605
597
|
image_b64 = convert_to_b64(image)
|
598
|
+
task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
|
606
599
|
data = {
|
607
600
|
"image": image_b64,
|
608
|
-
"
|
609
|
-
"detail_caption": detail_caption,
|
601
|
+
"task": task,
|
610
602
|
"function_name": "florencev2_image_caption",
|
611
603
|
}
|
612
604
|
|
613
|
-
answer = send_inference_request(data, "
|
614
|
-
return answer[
|
605
|
+
answer = send_inference_request(data, "florence2", v2=True)
|
606
|
+
return answer[task] # type: ignore
|
615
607
|
|
616
608
|
|
617
|
-
def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
|
618
|
-
"""'florencev2_object_detection' is a tool that can detect
|
619
|
-
|
620
|
-
as labels and their location as bounding boxes.
|
609
|
+
def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str, Any]]:
|
610
|
+
"""'florencev2_object_detection' is a tool that can detect objects given a text
|
611
|
+
prompt such as a phrase or class names separated by commas. It returns a list of
|
612
|
+
detected objects as labels and their location as bounding boxes with score of 1.0.
|
621
613
|
|
622
614
|
Parameters:
|
623
615
|
image (np.ndarray): The image to used to detect objects
|
@@ -631,29 +623,30 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
631
623
|
|
632
624
|
Example
|
633
625
|
-------
|
634
|
-
>>> florencev2_object_detection(image)
|
626
|
+
>>> florencev2_object_detection(image, 'person looking at a coyote')
|
635
627
|
[
|
636
|
-
{'score': 1.0, 'label': '
|
637
|
-
{'score': 1.0, 'label': '
|
638
|
-
{'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
628
|
+
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
629
|
+
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
639
630
|
]
|
640
631
|
"""
|
641
632
|
image_size = image.shape[:2]
|
642
633
|
image_b64 = convert_to_b64(image)
|
643
634
|
data = {
|
644
635
|
"image": image_b64,
|
645
|
-
"
|
636
|
+
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
|
637
|
+
"prompt": prompt,
|
646
638
|
"function_name": "florencev2_object_detection",
|
647
639
|
}
|
648
640
|
|
649
|
-
|
641
|
+
detections = send_inference_request(data, "florence2", v2=True)
|
642
|
+
detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
|
650
643
|
return_data = []
|
651
|
-
for i in range(len(
|
644
|
+
for i in range(len(detections["bboxes"])):
|
652
645
|
return_data.append(
|
653
646
|
{
|
654
|
-
"score":
|
655
|
-
"label":
|
656
|
-
"bbox": normalize_bbox(
|
647
|
+
"score": 1.0,
|
648
|
+
"label": detections["labels"][i],
|
649
|
+
"bbox": normalize_bbox(detections["bboxes"][i], image_size),
|
657
650
|
}
|
658
651
|
)
|
659
652
|
return return_data
|
@@ -742,13 +735,16 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
|
|
742
735
|
image_b64 = convert_to_b64(image)
|
743
736
|
data = {
|
744
737
|
"image": image_b64,
|
745
|
-
"tool": "generate_depth",
|
746
738
|
"function_name": "depth_anything_v2",
|
747
739
|
}
|
748
740
|
|
749
|
-
|
750
|
-
|
751
|
-
|
741
|
+
depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
|
742
|
+
depth_map_np = np.array(depth_map["map"])
|
743
|
+
depth_map_np = (depth_map_np - depth_map_np.min()) / (
|
744
|
+
depth_map_np.max() - depth_map_np.min()
|
745
|
+
)
|
746
|
+
depth_map_np = (255 * depth_map_np).astype(np.uint8)
|
747
|
+
return depth_map_np
|
752
748
|
|
753
749
|
|
754
750
|
def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
|
@@ -839,12 +835,11 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
|
|
839
835
|
image_b64 = convert_to_b64(image)
|
840
836
|
data = {
|
841
837
|
"image": image_b64,
|
842
|
-
"tool": "generate_pose",
|
843
838
|
"function_name": "generate_pose_image",
|
844
839
|
}
|
845
840
|
|
846
|
-
|
847
|
-
return_data = np.array(b64_to_pil(
|
841
|
+
pos_img = send_inference_request(data, "pose-detector", v2=True)
|
842
|
+
return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
|
848
843
|
return return_data
|
849
844
|
|
850
845
|
|
@@ -1063,7 +1058,6 @@ def save_video(
|
|
1063
1058
|
if fps <= 0:
|
1064
1059
|
_LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
|
1065
1060
|
fps = 4
|
1066
|
-
|
1067
1061
|
with ImageSequenceClip(frames, fps=fps) as video:
|
1068
1062
|
if output_video_path:
|
1069
1063
|
f = open(output_video_path, "wb")
|
@@ -1254,7 +1248,6 @@ TOOLS = [
|
|
1254
1248
|
loca_visual_prompt_counting,
|
1255
1249
|
florencev2_roberta_vqa,
|
1256
1250
|
florencev2_image_caption,
|
1257
|
-
florencev2_object_detection,
|
1258
1251
|
detr_segmentation,
|
1259
1252
|
depth_anything_v2,
|
1260
1253
|
generate_soft_edge_image,
|
@@ -209,7 +209,7 @@ class Result:
|
|
209
209
|
return formats
|
210
210
|
|
211
211
|
@staticmethod
|
212
|
-
def from_e2b_result(result: E2BResult) -> "Result":
|
212
|
+
def from_e2b_result(result: E2BResult) -> "Result":
|
213
213
|
"""
|
214
214
|
Creates a Result object from an E2BResult object.
|
215
215
|
"""
|
@@ -361,7 +361,7 @@ class Execution(BaseModel):
|
|
361
361
|
)
|
362
362
|
|
363
363
|
@staticmethod
|
364
|
-
def from_e2b_execution(exec: E2BExecution) -> "Execution":
|
364
|
+
def from_e2b_execution(exec: E2BExecution) -> "Execution":
|
365
365
|
"""Creates an Execution object from an E2BResult object."""
|
366
366
|
return Execution(
|
367
367
|
results=[Result.from_e2b_result(res) for res in exec.results],
|
@@ -14,7 +14,7 @@ class LandingaiAPIKey(BaseSettings):
|
|
14
14
|
"""
|
15
15
|
|
16
16
|
api_key: str = Field(
|
17
|
-
default="
|
17
|
+
default="land_sk_zKvyPcPV2bVoq7q87KwduoerAxuQpx33DnqP8M1BliOCiZOSoI",
|
18
18
|
alias="LANDINGAI_API_KEY",
|
19
19
|
description="The API key of LandingAI.",
|
20
20
|
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.98 → vision_agent-0.2.100}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
{vision_agent-0.2.98/vision_agent/fonts → vision_agent-0.2.100/vision_agent/clients}/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|