vision-agent 0.2.98__py3-none-any.whl → 0.2.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/clients/__init__.py +0 -0
- vision_agent/clients/http.py +46 -0
- vision_agent/clients/landing_public_api.py +26 -0
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +45 -0
- vision_agent/tools/meta_tools_types.py +30 -0
- vision_agent/tools/tool_utils.py +4 -3
- vision_agent/tools/tools.py +55 -62
- vision_agent/utils/execute.py +2 -2
- vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/METADATA +1 -1
- {vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/RECORD +15 -11
- {vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -28,7 +28,7 @@ class DefaultImports:
     code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
     ]
 
     @staticmethod
vision_agent/clients/__init__.py
File without changes
vision_agent/clients/http.py
ADDED
@@ -0,0 +1,46 @@
+import json
+import logging
+from typing import Any, Dict, Optional
+
+from requests import Session
+from requests.adapters import HTTPAdapter
+from requests.exceptions import ConnectionError, RequestException, Timeout
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class BaseHTTP:
+    _TIMEOUT = 30  # seconds
+    _MAX_RETRIES = 3
+
+    def __init__(
+        self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None
+    ) -> None:
+        self._headers = headers
+        if headers is None:
+            self._headers = {
+                "Content-Type": "application/json",
+            }
+        self._base_endpoint = base_endpoint
+        self._session = Session()
+        self._session.headers.update(self._headers)  # type: ignore
+        self._session.mount(
+            self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES)
+        )
+
+    def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+        formatted_url = f"{self._base_endpoint}/{url}"
+        _LOGGER.info(f"Sending data to {formatted_url}")
+        try:
+            response = self._session.post(
+                url=formatted_url, json=payload, timeout=self._TIMEOUT
+            )
+            response.raise_for_status()
+            result: Dict[str, Any] = response.json()
+            _LOGGER.info(json.dumps(result))
+        except (ConnectionError, Timeout, RequestException) as err:
+            _LOGGER.warning(f"Error: {err}.")
+        except json.JSONDecodeError:
+            resp_text = response.text
+            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+        return result
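For orientation, a minimal usage sketch of the new BaseHTTP client; the endpoint, path and payload below are placeholders, not part of the package:

    from vision_agent.clients.http import BaseHTTP

    # Placeholder endpoint, path and payload, for illustration only.
    client = BaseHTTP("https://api.example.com", headers={"apikey": "my-key"})
    result = client.post("v1/echo", payload={"message": "hello"})
    print(result)  # decoded JSON body as a dict

Note that post() logs and swallows request errors; in that case `result` is never bound and the final `return result` raises an error anyway, so callers should treat a failed request as fatal.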
vision_agent/clients/landing_public_api.py
ADDED
@@ -0,0 +1,26 @@
+import os
+from uuid import UUID
+from typing import List
+
+from vision_agent.clients.http import BaseHTTP
+from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
+
+
+class LandingPublicAPI(BaseHTTP):
+    def __init__(self) -> None:
+        landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
+        landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
+        headers = {"Content-Type": "application/json", "apikey": landing_api_key}
+        super().__init__(base_endpoint=landing_url, headers=headers)
+
+    def launch_fine_tuning_job(
+        self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
+    ) -> UUID:
+        url = "v1/agent/jobs/fine-tuning"
+        data = {
+            "model": {"name": model_name, "task": task.value},
+            "bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
+        }
+        response = self.post(url, payload=data)
+        return UUID(response["jobId"])
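A sketch of driving the new client directly; the base64 string is a placeholder (in the package, florencev2_fine_tuning in meta_tools.py does the conversion via convert_to_b64):

    from vision_agent.clients.landing_public_api import LandingPublicAPI
    from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask

    bbox = BboxInputBase64(
        image="<base64-encoded image>",  # placeholder
        filename="filename.png",
        labels=["screw"],
        bboxes=[(370, 30, 560, 290)],
    )
    # Reads LANDINGAI_URL / LANDINGAI_API_KEY from the environment, with defaults.
    api = LandingPublicAPI()
    job_id = api.launch_fine_tuning_job("florencev2", PromptTask.OBJECT_DETECTION, [bbox])
    print(job_id)  # a UUID parsed from the response's "jobId" field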
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import subprocess
+from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
@@ -7,6 +8,9 @@ import vision_agent as va
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
+    to detect objects in an image based on a given dataset. It returns the
+    fine-tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine-tuning job id; this id will be used to retrieve the
+            fine-tuned model.
+
+    Example
+    -------
+    >>> fine_tuning_job_id = florencev2_fine_tuning(
+        [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+         {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+        "OBJECT_DETECTION"
+    )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -398,5 +442,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         search_dir,
         search_file,
         find_file,
+        florencev2_fine_tuning,
     ]
 )
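Putting the pieces together, a call matching the docstring above would look like this sketch ('filename.png' is a placeholder path):

    from vision_agent.tools.meta_tools import florencev2_fine_tuning

    # The task string is resolved by enum member name (PromptTask[task]), so it
    # must be "CAPTION", "CAPTION_TO_PHRASE_GROUNDING" or "OBJECT_DETECTION".
    job_id = florencev2_fine_tuning(
        [
            {"image_path": "filename.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]},
            {"image_path": "filename.png", "labels": ["screw"], "bboxes": [[120, 0, 300, 170]]},
        ],
        "OBJECT_DETECTION",
    )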
vision_agent/tools/meta_tools_types.py
ADDED
@@ -0,0 +1,30 @@
+from enum import Enum
+from typing import List, Tuple
+
+from pydantic import BaseModel
+
+
+class BboxInput(BaseModel):
+    image_path: str
+    labels: List[str]
+    bboxes: List[Tuple[int, int, int, int]]
+
+
+class BboxInputBase64(BaseModel):
+    image: str
+    filename: str
+    labels: List[str]
+    bboxes: List[Tuple[int, int, int, int]]
+
+
+class PromptTask(str, Enum):
+    """
+    Valid task prompt options for the Florencev2 model.
+    """
+
+    CAPTION = "<CAPTION>"
+    """"""
+    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
+    """"""
+    OBJECT_DETECTION = "<OD>"
+    """"""
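A short sketch of how these types are meant to be used together; note that PromptTask is indexed by member name in meta_tools.py, while the wire format uses the Florence-2 task token stored in the value:

    from vision_agent.tools.meta_tools_types import BboxInput, PromptTask

    raw = {"image_path": "filename.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]}
    validated = BboxInput.model_validate(raw)  # pydantic v2 validation
    task = PromptTask["OBJECT_DETECTION"]      # lookup by member name
    print(task.value)                          # "<OD>", the token sent to the API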
vision_agent/tools/tool_utils.py
CHANGED
@@ -16,7 +16,8 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.landing.ai/v1/agent"
+_LND_API_URL = "https://api.landing.ai/v1/agent/model"
+_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
 
 
 class ToolCallTrace(BaseModel):
@@ -27,13 +28,13 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str
+    payload: Dict[str, Any], endpoint_name: str, v2: bool = False
 ) -> Dict[str, Any]:
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
             payload["runtime_tag"] = runtime_tag
 
-        url = f"{_LND_API_URL}/{endpoint_name}"
+        url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
         if "TOOL_ENDPOINT_URL" in os.environ:
             url = os.environ["TOOL_ENDPOINT_URL"]
 
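The new v2 flag only changes which base URL is used; a sketch of the resulting routing (build_url is a hypothetical helper mirroring the line inside send_inference_request; TOOL_ENDPOINT_URL still overrides both):

    _LND_API_URL = "https://api.landing.ai/v1/agent/model"
    _LND_API_URL_v2 = "https://api.landing.ai/v1/tools"

    def build_url(endpoint_name: str, v2: bool = False) -> str:
        # Mirrors the URL selection inside send_inference_request.
        return f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"

    assert build_url("owlv2", v2=True) == "https://api.landing.ai/v1/tools/owlv2"
    assert build_url("legacy-tool") == "https://api.landing.ai/v1/agent/model/legacy-tool"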
vision_agent/tools/tools.py
CHANGED
@@ -2,23 +2,23 @@ import io
 import json
 import logging
 import tempfile
-from importlib import resources
 from pathlib import Path
+from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import cv2
-import numpy as np
 import requests
+import numpy as np
+from pytube import YouTube  # type: ignore
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
-from pytube import YouTube  # type: ignore
 
 from vision_agent.tools.tool_utils import (
+    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
-    send_inference_request,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -126,7 +126,6 @@ def owl_v2(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
-    iou_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
@@ -138,8 +137,6 @@ def owl_v2(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
-        iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.10.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -159,22 +156,22 @@
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     request_data = {
-        "
+        "prompts": prompt.split("."),
         "image": image_b64,
-        "
-        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "confidence": box_threshold,
         "function_name": "owl_v2",
     }
-    data: Dict[str, Any] = send_inference_request(request_data, "
+    data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
     return_data = []
-
-
-
-
-
-
-
-
+    if data is not None:
+        for elt in data:
+            return_data.append(
+                {
+                    "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
+                    "label": elt["label"],  # type: ignore
+                    "score": round(elt["score"], 2),  # type: ignore
+                }
+            )
     return return_data
 
 
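From the caller's side the change is invisible apart from the dropped iou_threshold parameter; a usage sketch (placeholder image path, and assuming owl_v2 is still re-exported from vision_agent.tools):

    import cv2
    from vision_agent.tools import owl_v2

    image = cv2.imread("image.jpg")  # placeholder path
    detections = owl_v2("car, dinosaur", image, box_threshold=0.10)
    # Each entry looks like {"score": 0.99, "label": "car", "bbox": [x1, y1, x2, y2]}
    # with coordinates normalized to the image size by normalize_bbox.
    for det in detections:
        print(det["label"], det["score"], det["bbox"])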
@@ -367,11 +364,10 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "zero_shot_counting",
         "function_name": "loca_zero_shot_counting",
     }
-    resp_data = send_inference_request(data, "
-    resp_data["heat_map"] = np.array(
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
 
 
@@ -397,17 +393,15 @@ def loca_visual_prompt_counting(
 
     image_size = get_image_size(image)
     bbox = visual_prompt["bbox"]
-    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
     image_b64 = convert_to_b64(image)
 
     data = {
         "image": image_b64,
-        "
-        "tool": "few_shot_counting",
+        "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
         "function_name": "loca_visual_prompt_counting",
     }
-    resp_data = send_inference_request(data, "
-    resp_data["heat_map"] = np.array(
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
 
 
@@ -432,13 +426,12 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "
-        "tool": "image_question_answering_with_context",
+        "question": prompt,
         "function_name": "florencev2_roberta_vqa",
     }
 
-    answer = send_inference_request(data, "
-    return answer
+    answer = send_inference_request(data, "florence2-qa", v2=True)
+    return answer  # type: ignore
 
 
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
@@ -544,17 +537,16 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     Example
     -------
     >>> vit_nsfw_classification(image)
-    {"
+    {"label": "normal", "scores": 0.68},
     """
 
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "nsfw_image_classification",
         "function_name": "vit_nsfw_classification",
     }
-    resp_data = send_inference_request(data, "
-    resp_data["
+    resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+    resp_data["score"] = round(resp_data["score"], 4)
     return resp_data
 
 
@@ -603,21 +595,21 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
+    task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
     data = {
         "image": image_b64,
-        "
-        "detail_caption": detail_caption,
+        "task": task,
         "function_name": "florencev2_image_caption",
     }
 
-    answer = send_inference_request(data, "
-    return answer[
+    answer = send_inference_request(data, "florence2", v2=True)
+    return answer[task]  # type: ignore
 
 
-def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect
-
-    as labels and their location as bounding boxes.
+def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect objects given a text
+    prompt such as a phrase or class names separated by commas. It returns a list of
+    detected objects as labels and their location as bounding boxes with score of 1.0.
 
     Parameters:
         image (np.ndarray): The image to used to detect objects
@@ -631,29 +623,30 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> florencev2_object_detection(image)
+    >>> florencev2_object_detection(image, 'person looking at a coyote')
     [
-        {'score': 1.0, 'label': '
-        {'score': 1.0, 'label': '
-        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
     ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "
+        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+        "prompt": prompt,
         "function_name": "florencev2_object_detection",
     }
 
-
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(
+    for i in range(len(detections["bboxes"])):
         return_data.append(
             {
-                "score":
-                "label":
-                "bbox": normalize_bbox(
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
             }
         )
     return return_data
@@ -742,13 +735,16 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_depth",
         "function_name": "depth_anything_v2",
     }
 
-
-
-
+    depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
+    depth_map_np = np.array(depth_map["map"])
+    depth_map_np = (depth_map_np - depth_map_np.min()) / (
+        depth_map_np.max() - depth_map_np.min()
+    )
+    depth_map_np = (255 * depth_map_np).astype(np.uint8)
+    return depth_map_np
 
 
 def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
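The new post-processing is a plain min-max rescale to 8-bit; the same arithmetic in isolation (the input values are made up):

    import numpy as np

    depth = np.array([[0.5, 1.2], [2.0, 3.5]])  # hypothetical raw depth values
    rescaled = (depth - depth.min()) / (depth.max() - depth.min())  # now in [0, 1]
    depth_u8 = (255 * rescaled).astype(np.uint8)                    # now in [0, 255]
    print(depth_u8)  # [[  0  59] [127 255]]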
@@ -839,12 +835,11 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_pose",
         "function_name": "generate_pose_image",
     }
 
-
-    return_data = np.array(b64_to_pil(
+    pos_img = send_inference_request(data, "pose-detector", v2=True)
+    return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
     return return_data
 
 
@@ -1063,7 +1058,6 @@ def save_video(
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4
-
     with ImageSequenceClip(frames, fps=fps) as video:
         if output_video_path:
             f = open(output_video_path, "wb")
@@ -1254,7 +1248,6 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
-    florencev2_object_detection,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
vision_agent/utils/execute.py
CHANGED
@@ -209,7 +209,7 @@ class Result:
         return formats
 
     @staticmethod
-    def from_e2b_result(result: E2BResult) -> "Result":
+    def from_e2b_result(result: E2BResult) -> "Result":
         """
         Creates a Result object from an E2BResult object.
         """
@@ -361,7 +361,7 @@ class Execution(BaseModel):
     )
 
     @staticmethod
-    def from_e2b_execution(exec: E2BExecution) -> "Execution":
+    def from_e2b_execution(exec: E2BExecution) -> "Execution":
         """Creates an Execution object from an E2BResult object."""
         return Execution(
             results=[Result.from_e2b_result(res) for res in exec.results],
vision_agent/utils/type_defs.py
CHANGED
@@ -14,7 +14,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="
+        default="land_sk_zKvyPcPV2bVoq7q87KwduoerAxuQpx33DnqP8M1BliOCiZOSoI",
        alias="LANDINGAI_API_KEY",
         description="The API key of LandingAI.",
     )
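Since LandingaiAPIKey is a pydantic-settings class, the key baked into the default above is only a fallback; a sketch of the override order (placeholder key):

    import os
    from vision_agent.utils.type_defs import LandingaiAPIKey

    # The LANDINGAI_API_KEY environment variable (the field's alias) takes
    # precedence over the default in the Field definition.
    os.environ["LANDINGAI_API_KEY"] = "land_sk_my_own_key"  # placeholder
    assert LandingaiAPIKey().api_key == "land_sk_my_own_key"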
{vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/RECORD
CHANGED
@@ -2,28 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9Ks,135
 vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
 vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
 vision_agent/agent/vision_agent_coder.py,sha256=N8oVwfxrz6emHlucJC5hGQvkA9cQWW2sMLFtshwLdI8,30309
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
 vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
+vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/clients/http.py,sha256=1WMt29F12YFfPH03AttKxnUNXx5sNOD9ZuH4etbB054,1598
+vision_agent/clients/landing_public_api.py,sha256=Tjl8uBZWc3dvrCOKg-PCYjw3RC3X5Y6B50kaKn_QzL0,1050
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
 vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
 vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
-vision_agent/tools/__init__.py,sha256=
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
+vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
+vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=Zg2aP58UqVRUlEtekWwSwGK5Z5c0eyNrKOvAfEyY4Ik,4694
+vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
 vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
-vision_agent/utils/type_defs.py,sha256=
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.100.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.100.dist-info/METADATA,sha256=aoWhGb6-cKJpae77m_JsrUP7ljLz1LHVnmYLHSA7-U0,10729
+vision_agent-0.2.100.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.100.dist-info/RECORD,,
{vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/LICENSE
File without changes

{vision_agent-0.2.98.dist-info → vision_agent-0.2.100.dist-info}/WHEEL
File without changes