vision-agent 0.0.50__py3-none-any.whl → 0.0.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +35 -14
- vision_agent/agent/vision_agent_prompts.py +1 -3
- vision_agent/fonts/__init__.py +0 -0
- vision_agent/fonts/default_font_ch_en.ttf +0 -0
- vision_agent/image_utils.py +22 -10
- vision_agent/tools/tools.py +40 -90
- {vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/METADATA +3 -2
- {vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/RECORD +10 -8
- {vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/LICENSE +0 -0
- {vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED

@@ -5,6 +5,7 @@ import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+from PIL import Image
 from tabulate import tabulate
 
 from vision_agent.image_utils import overlay_bboxes, overlay_masks
@@ -288,9 +289,8 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
                 continue
             parameters = [parameters]
         elif isinstance(tool_result["parameters"], list):
-            if (
-                …
-                and "image" not in tool_result["parameters"][0]
+            if len(tool_result["parameters"]) < 1 or (
+                "image" not in tool_result["parameters"][0]
             ):
                 continue
 
@@ -304,10 +304,16 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
             # if the call was successful, then we can add the image data
             image = param["image"]
             if image not in image_to_data:
-                image_to_data[image] = {…
+                image_to_data[image] = {
+                    "bboxes": [],
+                    "masks": [],
+                    "labels": [],
+                    "scores": [],
+                }
 
             image_to_data[image]["bboxes"].extend(call_result["bboxes"])
             image_to_data[image]["labels"].extend(call_result["labels"])
+            image_to_data[image]["scores"].extend(call_result["scores"])
             if "masks" in call_result:
                 image_to_data[image]["masks"].extend(call_result["masks"])
 
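Note: `visualize_result` now tracks confidence scores next to the boxes, masks and labels it accumulates per image. A minimal sketch of the resulting shape (file names and values here are hypothetical):

```python
# Hypothetical contents of the per-image accumulator after a detection call.
image_to_data = {
    "cars.jpg": {
        "bboxes": [[0.1, 0.2, 0.4, 0.5]],      # normalized xyxy boxes from the tools
        "masks": ["/tmp/tmpabc123.mask.png"],  # mask file names, when present
        "labels": ["car"],
        "scores": [0.91],                      # newly tracked in 0.0.52
    }
}
```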
@@ -345,7 +351,7 @@ class VisionAgent(Agent):
         task_model: Optional[Union[LLM, LMM]] = None,
         answer_model: Optional[Union[LLM, LMM]] = None,
         reflect_model: Optional[Union[LLM, LMM]] = None,
-        max_retries: int = …
+        max_retries: int = 3,
         verbose: bool = False,
         report_progress_callback: Optional[Callable[[str], None]] = None,
     ):
@@ -380,6 +386,7 @@ class VisionAgent(Agent):
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        visualize_output: Optional[bool] = False,
     ) -> str:
         """Invoke the vision agent.
 
@@ -393,7 +400,7 @@ class VisionAgent(Agent):
         """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input, image=image)
+        return self.chat(input, image=image, visualize_output=visualize_output)
 
     def log_progress(self, description: str) -> None:
         _LOGGER.info(description)
@@ -401,7 +408,10 @@ class VisionAgent(Agent):
             self.report_progress_callback(description)
 
     def chat_with_workflow(
-        self, …
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+        visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
         question = chat[0]["content"]
         if image:
@@ -449,31 +459,42 @@
                 self.answer_model, question, answers, reflections
             )
 
-            …
-            all_tool_results.append({"…
+            visualized_output = visualize_result(all_tool_results)
+            all_tool_results.append({"visualized_output": visualized_output})
             reflection = self_reflect(
                 self.reflect_model,
                 question,
                 self.tools,
                 all_tool_results,
                 final_answer,
-                …
+                visualized_output[0] if len(visualized_output) > 0 else image,
             )
             self.log_progress(f"Reflection: {reflection}")
             if parse_reflect(reflection):
                 break
             else:
-                reflections += reflection
-        # '<…
+                reflections += "\n" + reflection
+        # '<END>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
             f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</<ANSWER>"
         )
+
+        if visualize_output:
+            visualized_output = all_tool_results[-1]["visualized_output"]
+            for image in visualized_output:
+                Image.open(image).show()
+
         return final_answer, all_tool_results
 
     def chat(
-        self, …
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+        visualize_output: Optional[bool] = False,
     ) -> str:
-        answer, _ = self.chat_with_workflow(…
+        answer, _ = self.chat_with_workflow(
+            chat, image=image, visualize_output=visualize_output
+        )
         return answer
 
     def retrieval(
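Note: taken together, these changes thread a single `visualize_output` flag from `__call__` through `chat` into `chat_with_workflow`. A minimal usage sketch, assuming the package's default models are configured (API keys in the environment) and that `VisionAgent` is importable from `vision_agent.agent`; the image name is hypothetical:

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent()
answer = agent(
    "Can you detect the person on the left and right?",
    image="person.jpg",
    visualize_output=True,  # new in 0.0.52: opens each annotated image via PIL's show()
)
print(answer)
```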
vision_agent/agent/vision_agent_prompts.py
CHANGED

@@ -1,4 +1,4 @@
-VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, …
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, concrete plan that aims to mitigate the same failure with the tools available. Do not make vague steps like re-evaluate the threshold, instead make concrete steps like use a threshold of 0.5 or whatever threshold you think would fix this issue. If the task cannot be completed with the existing tools, respond with Finish. Use complete sentences.
 
 User's question: {question}
 
@@ -49,7 +49,6 @@ Output: """
 
 CHOOSE_TOOL = """This is the user's question: {question}
 These are the tools you can select to solve the question:
-
 {tools}
 
 Please note that:
@@ -63,7 +62,6 @@ Output: """
 
 CHOOSE_TOOL_DEPENDS = """This is the user's question: {question}
 These are the tools you can select to solve the question:
-
 {tools}
 
 This is a reflection from a previous failed attempt:
vision_agent/fonts/__init__.py
File without changes

vision_agent/fonts/default_font_ch_en.ttf
Binary file

vision_agent/image_utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Utility functions for image processing."""
 
 import base64
+from importlib import resources
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, Tuple, Union
@@ -104,19 +105,28 @@ def overlay_bboxes(
 
     color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
 
-    draw = ImageDraw.Draw(image)
-    font = ImageFont.load_default()
     width, height = image.size
+    fontsize = max(12, int(min(width, height) / 40))
+    draw = ImageDraw.Draw(image)
+    font = ImageFont.truetype(
+        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+        fontsize,
+    )
     if "bboxes" not in bboxes:
         return image.convert("RGB")
 
-    for label, box in zip(bboxes["labels"], bboxes["bboxes"]):
-        box = [
-            …
-            …
-            …
-            …
-            …
+    for label, box, scores in zip(bboxes["labels"], bboxes["bboxes"], bboxes["scores"]):
+        box = [
+            int(box[0] * width),
+            int(box[1] * height),
+            int(box[2] * width),
+            int(box[3] * height),
+        ]
+        draw.rectangle(box, outline=color[label], width=4)
+        text = f"{label}: {scores:.2f}"
+        text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+        draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
+        draw.text((box[0], box[1]), text, fill="black", font=font)
     return image.convert("RGB")
 
 
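Note: `overlay_bboxes` now sizes its font from the image, loads the bundled TTF via `importlib.resources`, and expects `scores` alongside `labels` and `bboxes`. A minimal sketch of the expected input, assuming the `(image, bboxes)` argument order implied by the body; file names and values are hypothetical:

```python
from PIL import Image
from vision_agent.image_utils import overlay_bboxes

image = Image.open("cars.jpg")
detections = {
    "labels": ["car", "car"],
    "bboxes": [[0.10, 0.20, 0.45, 0.60], [0.50, 0.25, 0.85, 0.65]],  # normalized xyxy
    "scores": [0.91, 0.87],  # required in 0.0.52; rendered as "car: 0.91"
}
overlay_bboxes(image, detections).save("cars_annotated.png")
```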
@@ -138,7 +148,9 @@ def overlay_masks(
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
 
-    color = {…
+    color = {
+        label: COLORS[i % len(COLORS)] for i, label in enumerate(set(masks["labels"]))
+    }
     if "masks" not in masks:
         return image.convert("RGB")
 
vision_agent/tools/tools.py
CHANGED
@@ -53,9 +53,7 @@ class Tool(ABC):
 
 class NoOp(Tool):
     name = "noop_"
-    description = (
-        "'noop_' is a no-op tool that does nothing if you do not need to use a tool."
-    )
+    description = "'noop_' is a no-op tool that does nothing if you do not want answer the question directly and not use a tool."
     usage = {
         "required_parameters": [],
         "examples": [
@@ -85,7 +83,7 @@ class CLIP(Tool):
     _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "clip_"
-    description = "'clip_' is a tool that can classify …
+    description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -163,7 +161,7 @@ class GroundingDINO(Tool):
     _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."
+    description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -179,8 +177,11 @@ class GroundingDINO(Tool):
             "parameters": {"prompt": "car", "image": ""},
         },
         {
-            "scenario": "Can you detect the person on the left? Image name: person.jpg",
-            "parameters": {…
+            "scenario": "Can you detect the person on the left and right? Image name: person.jpg",
+            "parameters": {
+                "prompt": "left person. right person",
+                "image": "person.jpg",
+            },
         },
         {
             "scenario": "Detect the red shirts and green shirst. Image name: shirts.jpg",
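Note: per the updated description and example, a direct call would look roughly like this (a sketch only; the tool POSTs to the hosted endpoint above, so it needs network access, and the keys follow the new description):

```python
resp = GroundingDINO()(prompt="left person. right person", image="person.jpg")
# Expected keys per the updated description: "bboxes", "labels", "scores".
for label, box, score in zip(resp["labels"], resp["bboxes"], resp["scores"]):
    print(f"{label}: {box} ({score:.2f})")
```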
@@ -269,7 +270,7 @@ class GroundingSAM(Tool):
     _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect …
+    description = "'grounding_sam_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -285,8 +286,11 @@ class GroundingSAM(Tool):
             "parameters": {"prompt": "car", "image": ""},
         },
         {
-            "scenario": "Can you segment the person on the left? Image name: person.jpg",
-            "parameters": {…
+            "scenario": "Can you segment the person on the left and right? Image name: person.jpg",
+            "parameters": {
+                "prompt": "left person. right person",
+                "image": "person.jpg",
+            },
         },
         {
             "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
@@ -370,8 +374,9 @@ class AgentGroundingSAM(GroundingSAM):
         mask_files = []
         for mask in rets["masks"]:
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                …
-                …
+                file_name = Path(tmp.name).with_suffix(".mask.png")
+                Image.fromarray(mask * 255).save(file_name)
+                mask_files.append(str(file_name))
         rets["masks"] = mask_files
         return rets
 
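Note: masks now round-trip through temp files named `*.mask.png`, saved with pixel values 0/255. A sketch of recovering a boolean mask on the consumer side (file name hypothetical):

```python
import numpy as np
from PIL import Image

mask = np.array(Image.open("/tmp/tmpabc123.mask.png")) > 0  # back to a boolean mask
```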
@@ -380,7 +385,7 @@ class Counter(Tool):
     r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
 
     name = "counter_"
-    description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression."
+    description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression. It returns a dictionary containing the labels and their counts."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -400,14 +405,14 @@ class Counter(Tool):
 
     def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
         resp = GroundingDINO()(prompt, image)
-        return dict(CounterClass(resp[…
+        return dict(CounterClass(resp["labels"]))
 
 
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
     name = "crop_"
-    description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image."
+    description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image. It returns a file with the cropped image."
     usage = {
         "required_parameters": [
             {"name": "bbox", "type": "List[float]"},
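Note: the fixed `Counter.__call__` builds its counts from the detector's label list. A minimal illustration, assuming `CounterClass` is an alias of `collections.Counter` as the name suggests:

```python
from collections import Counter as CounterClass

labels = ["car", "car", "person"]  # e.g. resp["labels"] from GroundingDINO
print(dict(CounterClass(labels)))  # {'car': 2, 'person': 1}
```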
@@ -495,9 +500,7 @@ class SegArea(Tool):
 
 class BboxIoU(Tool):
     name = "bbox_iou_"
-    description = (
-        "'bbox_iou_' returns the intersection over union of two bounding boxes."
-    )
+    description = "'bbox_iou_' returns the intersection over union of two bounding boxes. This is a good tool for determining if two objects are overlapping."
     usage = {
         "required_parameters": [
             {"name": "bbox1", "type": "List[int]"},
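Note: for reference, the standard intersection-over-union the sharpened description names (a generic illustration, not the class's internal code):

```python
def iou(b1, b2):
    # Overlap area of two xyxy boxes divided by the area of their union.
    x1, y1 = max(b1[0], b2[0]), max(b1[1], b2[1])
    x2, y2 = min(b1[2], b2[2]), min(b1[3], b2[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    union = (b1[2] - b1[0]) * (b1[3] - b1[1]) + (b2[2] - b2[0]) * (b2[3] - b2[1]) - inter
    return inter / union

print(iou([0, 0, 2, 2], [1, 1, 3, 3]))  # ~0.14, i.e. the boxes overlap slightly
```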
@@ -591,85 +594,35 @@ class ExtractFrames(Tool):
         )
         for frame, ts in frames:
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                …
-                …
+                file_name = Path(tmp.name).with_suffix(".frame.png")
+                Image.fromarray(frame).save(file_name)
+                result.append((str(file_name), ts))
         return result
 
 
-class Add(Tool):
-    r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "add_"
-    description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
-        "examples": [
-            {
-                "scenario": "If you want to calculate 2 + 4",
-                "parameters": {"input": [2, 4]},
-            }
-        ],
-    }
-
-    def __call__(self, input: List[int]) -> float:
-        return round(sum(input), 2)
-
-
-class Subtract(Tool):
-    r"""Subtract returns the difference of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "subtract_"
-    description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
-        "examples": [
-            {
-                "scenario": "If you want to calculate 4 - 2",
-                "parameters": {"input": [4, 2]},
-            }
-        ],
-    }
-
-    def __call__(self, input: List[int]) -> float:
-        return round(input[0] - input[1], 2)
-
+class Calculator(Tool):
+    r"""Calculator is a tool that can perform basic arithmetic operations."""
 
-class Multiply(Tool):
-    r"""Multiply returns the product of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "multiply_"
-    description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."
+    name = "calculator_"
+    description = (
+        "'calculator_' is a tool that can perform basic arithmetic operations."
+    )
     usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
+        "required_parameters": [{"name": "equation", "type": "str"}],
         "examples": [
             {
-                "scenario": "If you want to calculate 2 * 4",
-                "parameters": {"input": [2, 4]},
-            }
-        ],
-    }
-
-    def __call__(self, input: List[int]) -> float:
-        return round(input[0] * input[1], 2)
-
-
-class Divide(Tool):
-    r"""Divide returns the division of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "divide_"
-    description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
-        "examples": [
+                "scenario": "If you want to calculate (2 * 3) + 4",
+                "parameters": {"equation": "2 + 4"},
+            },
             {
-                "scenario": "If you want to calculate 4 / 2",
-                "parameters": {"input": [4, 2]},
-            }
+                "scenario": "If you want to calculate (4 + 2.5) / 2.1",
+                "parameters": {"equation": "(4 + 2.5) / 2.1"},
+            },
         ],
     }
 
-    def __call__(self, …
-        return round(…
+    def __call__(self, equation: str) -> float:
+        return cast(float, round(eval(equation), 2))
 
 
 TOOLS = {
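Note: the four fixed-arity arithmetic tools collapse into one `Calculator` that takes a free-form equation string. A sketch of its behavior as written:

```python
calc = Calculator()
print(calc(equation="(4 + 2.5) / 2.1"))  # 3.1 (rounded to 2 decimal places)
print(calc(equation="2 * 3 + 4"))        # 10
```

Since `__call__` runs Python's `eval` on the string, the equation is executed as arbitrary Python; in 0.0.52 the input is effectively trusted.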
@@ -687,10 +640,7 @@ TOOLS = {
             SegArea,
             BboxIoU,
             SegIoU,
-            Add,
-            Subtract,
-            Multiply,
-            Divide,
+            Calculator,
         ]
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
{vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.50
+Version: 0.0.52
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -103,7 +103,8 @@ the individual steps and tools to get the answer:
     }
 ]],
 "answer": "The jar is located at [0.58, 0.2, 0.72, 0.45].",
-}
+},
+{"visualize_output": "final_output.png"}]
 ```
 
 ### Tools
{vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/RECORD
CHANGED

@@ -5,22 +5,24 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=dYzWa_RaiaFSQ-CowoQOcFmjZtBTTljRyA809bLgrvU,4519
 vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=…
-vision_agent/agent/vision_agent_prompts.py,sha256=…
+vision_agent/agent/vision_agent.py,sha256=TKseWK3C7kr9GmjQmYgNSBZJHPqd7wTP6BSkwYqJkdY,19765
+vision_agent/agent/vision_agent_prompts.py,sha256=dPg0mLVK_fGJpYK2xXGhm-zuXX1KVZW_zFXyYsspUz8,6567
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
 vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
 vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
 vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
-vision_agent/image_utils.py,sha256=…
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/image_utils.py,sha256=hFdPoRmeVU5jErFr5xaagMQ6Wy7Xbw8H8HXuLGdJIAM,4786
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
 vision_agent/llm/llm.py,sha256=tgL6ZtuwZKuxSNiCxJCuP2ETjNMrosdgxXkZJb0_00E,5024
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=LxwxCArp7DfnPbjf_Gl55xBxPwo2Qx8eDp1gCnGYSO0,9535
 vision_agent/tools/__init__.py,sha256=AKN-T659HpwVearRnkCd6wWNoJ6K5kW9gAZwb8IQSLE,235
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=h11niI1JiOCOaOFkdHee_AnXegaIK5Al8QMoFdZaJuo,24583
 vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
-vision_agent-0.0.50.dist-info/LICENSE,sha256=…
-vision_agent-0.0.50.dist-info/METADATA,sha256=…
-vision_agent-0.0.50.dist-info/WHEEL,sha256=…
-vision_agent-0.0.50.dist-info/RECORD,,
+vision_agent-0.0.52.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.0.52.dist-info/METADATA,sha256=5OBmHCpSDZbvGb_pNU_cOKWI9AdUOhEufDHigk_cm3c,6184
+vision_agent-0.0.52.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.0.52.dist-info/RECORD,,
{vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/LICENSE
File without changes

{vision_agent-0.0.50.dist-info → vision_agent-0.0.52.dist-info}/WHEEL
File without changes