vision-agent 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -366,6 +366,20 @@ def _handle_viz_tools(
     return image_to_data


+def sample_n_evenly_spaced(lst: Sequence, n: int) -> Sequence:
+    if n <= 0:
+        return []
+    elif len(lst) == 0:
+        return []
+    elif n == 1:
+        return [lst[0]]
+    elif n >= len(lst):
+        return lst
+
+    spacing = (len(lst) - 1) / (n - 1)
+    return [lst[round(spacing * i)] for i in range(n)]
+
+
 def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]:
     image_to_data: Dict[str, Dict] = {}
     for tool_result in all_tool_results:
@@ -377,6 +391,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "dinov_",
             "zero_shot_counting_",
             "visual_prompt_counting_",
+            "ocr_",
         ]:
             continue

@@ -523,20 +538,20 @@ class VisionAgent(Agent):
         if image:
             question += f" Image name: {image}"
         if reference_data:
-            if not (
-                "image" in reference_data
-                and ("mask" in reference_data or "bbox" in reference_data)
-            ):
-                raise ValueError(
-                    f"Reference data must contain 'image' and a visual prompt which can be 'mask' or 'bbox'. but got {reference_data}"
-                )
-            visual_prompt_data = (
-                f"Reference mask: {reference_data['mask']}"
+            question += (
+                f" Reference image: {reference_data['image']}"
+                if "image" in reference_data
+                else ""
+            )
+            question += (
+                f" Reference mask: {reference_data['mask']}"
                 if "mask" in reference_data
-                else f"Reference bbox: {reference_data['bbox']}"
+                else ""
             )
             question += (
-                f" Reference image: {reference_data['image']}, {visual_prompt_data}"
+                f" Reference bbox: {reference_data['bbox']}"
+                if "bbox" in reference_data
+                else ""
             )

         reflections = ""
@@ -583,7 +598,7 @@ class VisionAgent(Agent):
             visualized_output = visualize_result(all_tool_results)
             all_tool_results.append({"visualized_output": visualized_output})
             if len(visualized_output) > 0:
-                reflection_images = visualized_output
+                reflection_images = sample_n_evenly_spaced(visualized_output, 3)
             elif image is not None:
                 reflection_images = [image]
             else:
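Taken together, these hunks add a sample_n_evenly_spaced helper and use it to cap the number of visualized outputs passed back as reflection images at three. A minimal sketch of the helper's behavior, assuming vision-agent 0.2.5 is installed and using hypothetical frame paths:

```python
# The helper lives in vision_agent/agent/vision_agent.py per the RECORD below.
from vision_agent.agent.vision_agent import sample_n_evenly_spaced

frames = ["f0.png", "f1.png", "f2.png", "f3.png", "f4.png"]  # hypothetical paths

# spacing = (5 - 1) / (3 - 1) = 2.0, so indices 0, 2 and 4 are kept.
print(sample_n_evenly_spaced(frames, 3))   # ['f0.png', 'f2.png', 'f4.png']
print(sample_n_evenly_spaced(frames, 1))   # ['f0.png']
print(sample_n_evenly_spaced(frames, 10))  # n >= len(lst): the whole list is returned
```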
vision_agent/llm/llm.py CHANGED
@@ -131,6 +131,11 @@ class OpenAILLM(LLM):
     def generate_zero_shot_counter(self, question: str) -> Callable:
         return lambda x: ZeroShotCounting()(**{"image": x})

+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+

 class AzureOpenAILLM(OpenAILLM):
     def __init__(
vision_agent/lmm/lmm.py CHANGED
@@ -9,14 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, cast
 import requests
 from openai import AzureOpenAI, OpenAI

-from vision_agent.tools import (
-    CHOOSE_PARAMS,
-    CLIP,
-    SYSTEM_PROMPT,
-    GroundingDINO,
-    GroundingSAM,
-    ZeroShotCounting,
-)
+from vision_agent.tools import CHOOSE_PARAMS, SYSTEM_PROMPT

 _LOGGER = logging.getLogger(__name__)

@@ -205,6 +198,8 @@ class OpenAILMM(LMM):
         return cast(str, response.choices[0].message.content)

     def generate_classifier(self, question: str) -> Callable:
+        from vision_agent.tools import CLIP
+
         api_doc = CLIP.description + "\n" + str(CLIP.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -228,6 +223,8 @@ class OpenAILMM(LMM):
         return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})

     def generate_detector(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingDINO
+
         api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -251,6 +248,8 @@ class OpenAILMM(LMM):
         return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})

     def generate_segmentor(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingSAM
+
         api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -274,8 +273,15 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})

     def generate_zero_shot_counter(self, question: str) -> Callable:
+        from vision_agent.tools import ZeroShotCounting
+
         return lambda x: ZeroShotCounting()(**{"image": x})

+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+

 class AzureOpenAILMM(OpenAILMM):
     def __init__(
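In both llm.py and lmm.py the tool imports move from module level into the methods that use them, so that vision_agent.lmm only pulls CHOOSE_PARAMS and SYSTEM_PROMPT from vision_agent.tools at import time. This is presumably to avoid a circular import, since tools.py now imports OpenAILMM (see the tools.py hunks below). A sketch of how the new generate_image_qa_tool method would be used, assuming vision-agent 0.2.5 is installed and the relevant API keys are configured; the image path is hypothetical:

```python
from vision_agent.lmm import OpenAILMM

lmm = OpenAILMM()
# generate_image_qa_tool returns a callable that runs ImageQuestionAnswering with
# the question baked in; the tool classes are imported only when the method is
# called, not when vision_agent.lmm itself is imported.
qa_tool = lmm.generate_image_qa_tool("What color is the car?")
print(qa_tool("street.jpg"))  # hypothetical local image
```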
@@ -1,6 +1,7 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (  # Counter,
     CLIP,
+    OCR,
     TOOLS,
     BboxArea,
     BboxIoU,
@@ -11,9 +12,12 @@ from .tools import ( # Counter,
     GroundingDINO,
     GroundingSAM,
     ImageCaption,
-    ZeroShotCounting,
-    VisualPromptCounting,
+    ImageQuestionAnswering,
     SegArea,
     SegIoU,
     Tool,
+    VisualPromptCounting,
+    VisualQuestionAnswering,
+    ZeroShotCounting,
+    register_tool,
 )
@@ -1,8 +1,9 @@
+import io
 import logging
 import tempfile
 from abc import ABC
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Tuple, Type, Union, cast

 import numpy as np
 import requests
@@ -11,11 +12,12 @@ from PIL.Image import Image as ImageType

 from vision_agent.image_utils import (
     convert_to_b64,
+    denormalize_bbox,
     get_image_size,
-    rle_decode,
     normalize_bbox,
-    denormalize_bbox,
+    rle_decode,
 )
+from vision_agent.lmm import OpenAILMM
 from vision_agent.tools.video import extract_frames_from_video
 from vision_agent.type_defs import LandingaiAPIKey

@@ -29,6 +31,9 @@ class Tool(ABC):
     description: str
     usage: Dict

+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+

 class NoOp(Tool):
     name = "noop_"
@@ -498,7 +503,7 @@ class ZeroShotCounting(Tool):

     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the Image captioning model.
+        """Invoke the Zero shot counting model.

         Parameters:
             image: the input image.
@@ -562,7 +567,7 @@ class VisualPromptCounting(Tool):

     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
-        """Invoke the Image captioning model.
+        """Invoke the few shot counting model.

         Parameters:
             image: the input image.
@@ -583,6 +588,144 @@ class VisualPromptCounting(Tool):
         return _send_inference_request(data, "tools")


+class VisualQuestionAnswering(Tool):
+    r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.VisualQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "visual_question_answering_"
+    description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: str, prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        gpt = OpenAILMM()
+        return {"text": gpt(input=prompt, images=[image])}
+
+
+class ImageQuestionAnswering(Tool):
+    r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+    It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
+    It is also useful if the user wants the data to be not exposed to OpenAI endpoints
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.ImageQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "image_question_answering_"
+    description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "prompt": prompt,
+            "tool": "image_question_answering",
+        }
+
+        return _send_inference_request(data, "tools")
+
+
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""

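The main difference between the two classes added above is where inference runs: VisualQuestionAnswering answers by instantiating OpenAILMM, so the image goes to OpenAI, while ImageQuestionAnswering base64-encodes the image and posts it to the hosted tools endpoint via _send_inference_request, which is why its docstring suggests it when data should not reach OpenAI. A usage sketch, assuming vision-agent 0.2.5 is installed and image1.jpg is a local file:

```python
import vision_agent as va

# Agent-facing tool: delegates to the OpenAI LMM under the hood.
vqa = va.tools.VisualQuestionAnswering()
print(vqa(image="image1.jpg", prompt="Describe this image in detail"))
# e.g. {'text': 'The image contains a cat sitting on a table with a bowl of milk.'}

# Same call signature, but inference runs on the hosted tools endpoint instead.
iqa = va.tools.ImageQuestionAnswering()
print(iqa(image="image1.jpg", prompt="Describe this image in detail"))
```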
@@ -858,6 +1001,57 @@ class ExtractFrames(Tool):
         return result


+class OCR(Tool):
+    name = "ocr_"
+    description = "'ocr_' extracts text from an image."
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you extract the text from this image? Image name: image.png",
+                "parameters": {"image": "image.png"},
+            },
+        ],
+    }
+    _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+    _URL = "https://app.landing.ai/ocr/v1/detect-text"
+
+    def __call__(self, image: str) -> dict:
+        pil_image = Image.open(image).convert("RGB")
+        image_size = pil_image.size[::-1]
+        image_buffer = io.BytesIO()
+        pil_image.save(image_buffer, format="PNG")
+        buffer_bytes = image_buffer.getvalue()
+        image_buffer.close()
+
+        res = requests.post(
+            self._URL,
+            files={"images": buffer_bytes},
+            data={"language": "en"},
+            headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
+        )
+        if res.status_code != 200:
+            _LOGGER.error(f"Request failed: {res.text}")
+            raise ValueError(f"Request failed: {res.text}")
+
+        data = res.json()
+        output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
+        for det in data[0]:
+            output["labels"].append(det["text"])
+            box = [
+                det["location"][0]["x"],
+                det["location"][0]["y"],
+                det["location"][2]["x"],
+                det["location"][2]["y"],
+            ]
+            box = normalize_bbox(box, image_size)
+            output["bboxes"].append(box)
+            output["scores"].append(round(det["score"], 2))
+        return output
+
+
 class Calculator(Tool):
     r"""Calculator is a tool that can perform basic arithmetic operations."""

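Based on the __call__ body above, OCR returns the same parallel-list shape as the detection tools: labels holds the recognized strings, bboxes the corner coordinates normalized to the image size by normalize_bbox, and scores the confidences rounded to two decimals. A sketch of the output shape, with invented values and a hypothetical image:

```python
import vision_agent as va

ocr = va.tools.OCR()
result = ocr(image="sign.jpg")  # hypothetical local file

# Illustrative shape only (values invented for the example):
# {
#     "labels": ["STOP", "4-WAY"],
#     "bboxes": [[0.32, 0.18, 0.68, 0.35], [0.35, 0.38, 0.65, 0.47]],
#     "scores": [0.99, 0.97],
# }
```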
@@ -889,11 +1083,11 @@ TOOLS = {
         [
             NoOp,
             CLIP,
-            ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
             ZeroShotCounting,
             VisualPromptCounting,
+            VisualQuestionAnswering,
             AgentDINOv,
             ExtractFrames,
             Crop,
@@ -903,6 +1097,7 @@ TOOLS = {
             SegIoU,
             BboxContains,
             BoxDistance,
+            OCR,
             Calculator,
         ]
     )
@@ -910,6 +1105,31 @@ TOOLS = {
 }


+def register_tool(tool: Type[Tool]) -> Type[Tool]:
+    r"""Add a tool to the list of available tools.
+
+    Parameters:
+        tool: The tool to add.
+    """
+
+    if (
+        not hasattr(tool, "name")
+        or not hasattr(tool, "description")
+        or not hasattr(tool, "usage")
+    ):
+        raise ValueError(
+            "The tool must have 'name', 'description' and 'usage' attributes."
+        )
+
+    TOOLS[len(TOOLS)] = {
+        "name": tool.name,
+        "description": tool.description,
+        "usage": tool.usage,
+        "class": tool,
+    }
+    return tool
+
+
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
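register_tool validates that a Tool subclass defines name, description and usage, then appends it to the integer-keyed TOOLS registry; the README hunk further down shows the documented NumItems example. A minimal sketch with a hypothetical Echo tool, just to show what registration does to the registry:

```python
from vision_agent.tools import TOOLS, Tool, register_tool


@register_tool
class Echo(Tool):
    # Hypothetical tool used only to illustrate registration.
    name = "echo_"
    description = "'echo_' returns its prompt unchanged."
    usage = {
        "required_parameters": [{"name": "prompt", "type": "str"}],
        "examples": [
            {"scenario": "Echo back 'hello'", "parameters": {"prompt": "hello"}}
        ],
    }

    def __call__(self, prompt: str) -> str:
        return prompt


# The new tool is appended under the next integer key in TOOLS.
print(TOOLS[len(TOOLS) - 1]["name"])  # 'echo_'
```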
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.3
+Version: 0.2.5
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -58,7 +58,7 @@ pip install vision-agent
 ```

 Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the additional setup section):
+using Azure OpenAI please see the Azure setup section):

 ```bash
 export OPENAI_API_KEY="your-api-key"
@@ -123,6 +123,31 @@ you. For example:
 }]
 ```

+#### Custom Tools
+You can also add your own custom tools for your vision agent to use:
+
+```python
+>>> from vision_agent.tools import Tool, register_tool
+>>> @register_tool
+>>> class NumItems(Tool):
+>>>     name = "num_items_"
+>>>     description = "Returns the number of items in a list."
+>>>     usage = {
+>>>         "required_parameters": [{"name": "prompt", "type": "list"}],
+>>>         "examples": [
+>>>             {
+>>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
+>>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
+>>>             }
+>>>         ],
+>>>     }
+>>>     def __call__(self, prompt: list[str]) -> int:
+>>>         return len(prompt)
+```
+This will register it with the list of tools Vision Agent has access to. It will be able
+to pick it based on the tool description and use it based on the usage provided.
+
+#### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
@@ -141,11 +166,12 @@ you. For example:
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+| OCR | OCR returns the text detected in an image along with the location. |


 It also has a basic set of calculate tools such as add, subtract, multiply and divide.

-### Additional Setup
+### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:

 ```bash
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=6AtVaEQL0ksg1QkUBn_YhytYjRfH7-M4q7G6pnds9Ds,25002
+vision_agent/agent/vision_agent.py,sha256=SFdw6OBqWj0cr-YthFMM_x-Urg86CggazYQG4wy0n-U,25195
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=gwDQ9-p9wEn24xi1019e5jzTGQg4xWDSqBCsqIqGcU4,5168
+vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=FjxCuIk0KXuWnfY4orVmdyhJW2I4C6i5QNNEXk7gybk,10197
-vision_agent/tools/__init__.py,sha256=BlfxqbYkB0oODhnSmQg1UyzQm73AvvjCjrIiOWBIYDs,328
+vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
+vision_agent/tools/__init__.py,sha256=HfUr0JQUwk0Kyieen93df9lMbbdpVf9Q6CcVFmKv_q4,413
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=Cwh7GNSnCYxyKKgusHlf-Cqd9NBjlbZG7d-GauQJCwI,34751
+vision_agent/tools/tools.py,sha256=GvRDLeMVS9C7z56hlSpThGoV0r_x5pKSFw-g4JW_qnw,42779
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.2.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.3.dist-info/METADATA,sha256=cQnQTRlWBxf0aVwsMoJS4TiiAtN3SbU00nlCrbNNb9w,6748
-vision_agent-0.2.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.3.dist-info/RECORD,,
+vision_agent-0.2.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.5.dist-info/METADATA,sha256=zSTYpM893hERFpO2j7-YdRmRPKeGI6-qU_wkq5MitFY,7697
+vision_agent-0.2.5.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.5.dist-info/RECORD,,