vision-agent 0.1.6__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vision_agent/__init__.py CHANGED
@@ -1,5 +1,3 @@
  from .agent import Agent
- from .data import DataStore, build_data_store
- from .emb import Embedder, OpenAIEmb, SentenceTransformerEmb, get_embedder
  from .llm import LLM, OpenAILLM
  from .lmm import LMM, LLaVALMM, OpenAILMM, get_lmm
vision_agent/agent/vision_agent.py CHANGED
@@ -8,7 +8,12 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
  from PIL import Image
  from tabulate import tabulate

- from vision_agent.image_utils import overlay_bboxes, overlay_masks, overlay_heat_map
+ from vision_agent.image_utils import (
+     convert_to_b64,
+     overlay_bboxes,
+     overlay_heat_map,
+     overlay_masks,
+ )
  from vision_agent.llm import LLM, OpenAILLM
  from vision_agent.lmm import LMM, OpenAILMM
  from vision_agent.tools import TOOLS
@@ -481,6 +486,17 @@ class VisionAgent(Agent):
          if self.report_progress_callback:
              self.report_progress_callback(description)

+     def _report_visualization_via_callback(
+         self, images: Sequence[Union[str, Path]]
+     ) -> None:
+         """This is intended for streaming the visualization images via the callback to the client side."""
+         if self.report_progress_callback:
+             self.report_progress_callback("<VIZ>")
+             if images:
+                 for img in images:
+                     self.report_progress_callback(f"<IMG>{convert_to_b64(img)}</IMG>")
+             self.report_progress_callback("</VIZ>")
+
      def chat_with_workflow(
          self,
          chat: List[Dict[str, str]],
@@ -577,9 +593,12 @@ class VisionAgent(Agent):
          )

          if visualize_output:
-             visualized_output = all_tool_results[-1]["visualized_output"]
-             for image in visualized_output:
-                 Image.open(image).show()
+             viz_images: Sequence[Union[str, Path]] = all_tool_results[-1][
+                 "visualized_output"
+             ]
+             self._report_visualization_via_callback(viz_images)
+             for img in viz_images:
+                 Image.open(img).show()

          return final_answer, all_tool_results
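Note on the new streaming hook: `_report_visualization_via_callback` frames each visualization image as a base64 payload inside `<IMG>...</IMG>` tags, bracketed by a `<VIZ>`/`</VIZ>` pair, all pushed through the same `report_progress_callback` that already carries text updates. A minimal client-side sketch that consumes this stream; the `VizCollector` class is illustrative, not part of the package, and it simply ignores ordinary progress messages outside the `<VIZ>` window:

    import base64
    import re


    class VizCollector:
        """Collect image payloads streamed between <VIZ> and </VIZ> markers."""

        def __init__(self) -> None:
            self.images: list[bytes] = []
            self._in_viz = False

        def __call__(self, message: str) -> None:
            if message == "<VIZ>":
                self._in_viz = True
            elif message == "</VIZ>":
                self._in_viz = False
            elif self._in_viz:
                # Each frame is "<IMG>" + base64 data + "</IMG>" (PNG bytes,
                # per the convert_to_b64 change below).
                match = re.fullmatch(r"<IMG>(.*)</IMG>", message, flags=re.DOTALL)
                if match:
                    self.images.append(base64.b64decode(match.group(1)))


    # Hypothetical wiring: agent = VisionAgent(report_progress_callback=VizCollector())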
vision_agent/image_utils.py CHANGED
@@ -4,7 +4,7 @@ import base64
  from importlib import resources
  from io import BytesIO
  from pathlib import Path
- from typing import Dict, Tuple, Union, List
+ from typing import Dict, List, Tuple, Union

  import numpy as np
  from PIL import Image, ImageDraw, ImageFont
@@ -108,7 +108,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
          data = Image.open(data)
      if isinstance(data, Image.Image):
          buffer = BytesIO()
-         data.convert("RGB").save(buffer, format="JPEG")
+         data.convert("RGB").save(buffer, format="PNG")
          return base64.b64encode(buffer.getvalue()).decode("utf-8")
      else:
          arr_bytes = data.tobytes()
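The `convert_to_b64` switch from JPEG to PNG trades payload size for lossless encoding, which matters now that the same strings carry visualization overlays back to the client. A small round-trip sketch using only the API shown above:

    import base64
    from io import BytesIO

    import numpy as np
    from PIL import Image

    from vision_agent.image_utils import convert_to_b64

    # PNG is lossless, so the decoded image matches the source pixel-for-pixel;
    # the previous JPEG encoding would not survive this exact comparison.
    original = Image.new("RGB", (32, 32), color=(200, 30, 30))
    decoded = Image.open(BytesIO(base64.b64decode(convert_to_b64(original))))
    assert np.array_equal(np.array(original), np.array(decoded))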
{vision_agent-0.1.6.dist-info → vision_agent-0.2.2.dist-info}/METADATA RENAMED
@@ -1,15 +1,14 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.1.6
+ Version: 0.2.2
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
- Requires-Python: >=3.9,<3.12
+ Requires-Python: >=3.9
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
- Requires-Dist: faiss-cpu (>=1.0.0,<2.0.0)
  Requires-Dist: moviepy (>=1.0.0,<2.0.0)
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
  Requires-Dist: openai (>=1.0.0,<2.0.0)
@@ -18,9 +17,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
  Requires-Dist: requests (>=2.0.0,<3.0.0)
- Requires-Dist: sentence-transformers (>=2.0.0,<3.0.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
- Requires-Dist: torch (>=2.1.0,<2.2.0)
  Requires-Dist: tqdm (>=4.64.0,<5.0.0)
  Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
  Project-URL: Homepage, https://landing.ai
{vision_agent-0.1.6.dist-info → vision_agent-0.2.2.dist-info}/RECORD RENAMED
@@ -1,19 +1,15 @@
- vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
+ vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
  vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
  vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
  vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
  vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
- vision_agent/agent/vision_agent.py,sha256=MTxeV5_Sghqoe2aOW9EbNgiq61sVCcF3ZndJ7BZl6x0,23588
+ vision_agent/agent/vision_agent.py,sha256=2VUMRVI6KAnmaUK-34wrgyfSQ2DAUm4g4QQcpqa2zao,24235
  vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
- vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
- vision_agent/data/data.py,sha256=Z2l76OrT0GgyuN52OeJqDitUcP0q1rhfdXd1of3GsVo,5128
- vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
- vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
- vision_agent/image_utils.py,sha256=Cg4aKO1tQiETT1gdsZ50XzORBtJnBFfMG2cKJyjaY6Q,7555
+ vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
  vision_agent/llm/llm.py,sha256=gwDQ9-p9wEn24xi1019e5jzTGQg4xWDSqBCsqIqGcU4,5168
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
@@ -23,7 +19,7 @@ vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E
  vision_agent/tools/tools.py,sha256=gCjHs5vJuGNBFsnJWFT7PX3wTyfHgtrgX1Eq9vqknN0,34979
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
- vision_agent-0.1.6.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.1.6.dist-info/METADATA,sha256=Ig2tSKyeH8a2A8xZRq72M9XnKyi4_03UM4hDiFpT-eU,6574
- vision_agent-0.1.6.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.1.6.dist-info/RECORD,,
+ vision_agent-0.2.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.2.dist-info/METADATA,sha256=dOZ9KWmhuVb5wvschxYBis8x79HwgOD3MmTKqyupggg,6434
+ vision_agent-0.2.2.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.2.dist-info/RECORD,,
vision_agent/data/__init__.py DELETED
@@ -1 +0,0 @@
- from .data import DataStore, build_data_store
vision_agent/data/data.py DELETED
@@ -1,142 +0,0 @@
- from __future__ import annotations
-
- import uuid
- from pathlib import Path
- from typing import Callable, Dict, List, Optional, Union, cast
-
- import faiss
- import numpy as np
- import numpy.typing as npt
- import pandas as pd
- from faiss import read_index, write_index
- from tqdm import tqdm
- from typing_extensions import Self
-
- from vision_agent.emb import Embedder
- from vision_agent.lmm import LMM
-
- tqdm.pandas()
-
-
- class DataStore:
-     r"""A class to store and manage image data along with its generated metadata from an LMM."""
-
-     def __init__(self, df: pd.DataFrame):
-         r"""Initializes the DataStore with a DataFrame containing image paths and image
-         IDs. If the image IDs are not present, they are generated using UUID4. The
-         DataFrame must contain an 'image_paths' column.
-
-         Args:
-             df: The DataFrame containing "image_paths" and "image_id" columns.
-         """
-         self.df = df
-         self.lmm: Optional[LMM] = None
-         self.emb: Optional[Embedder] = None
-         self.index: Optional[faiss.IndexFlatIP] = None  # type: ignore
-         if "image_paths" not in self.df.columns:
-             raise ValueError("image_paths column must be present in DataFrame")
-         if "image_id" not in self.df.columns:
-             self.df["image_id"] = [str(uuid.uuid4()) for _ in range(len(df))]
-
-     def add_embedder(self, emb: Embedder) -> Self:
-         self.emb = emb
-         return self
-
-     def add_lmm(self, lmm: LMM) -> Self:
-         self.lmm = lmm
-         return self
-
-     def add_column(
-         self, name: str, prompt: str, func: Optional[Callable[[str], str]] = None
-     ) -> Self:
-         r"""Adds a new column to the DataFrame containing the generated metadata from
-         the LMM.
-
-         Args:
-             name: The name of the column to be added.
-             prompt: The prompt to be used to generate the metadata.
-             func: A Python function to be applied on the output of `lmm.generate`.
-                 Defaults to None.
-         """
-         if self.lmm is None:
-             raise ValueError("LMM not set yet")
-
-         self.df[name] = self.df["image_paths"].progress_apply(  # type: ignore
-             lambda x: (
-                 func(self.lmm.generate(prompt, images=[x]))
-                 if func
-                 else self.lmm.generate(prompt, images=[x])
-             )
-         )
-         return self
-
-     def build_index(self, target_col: str) -> Self:
-         r"""This will generate embeddings for the `target_col` and build a searchable
-         index over them, so next time you run search it will search over this index.
-
-         Args:
-             target_col: The column name containing the data to be indexed."""
-         if self.emb is None:
-             raise ValueError("Embedder not set yet")
-
-         embeddings: pd.Series = self.df[target_col].progress_apply(lambda x: self.emb.embed(x))  # type: ignore
-         embeddings_np = np.array(embeddings.tolist()).astype(np.float32)
-         self.index = faiss.IndexFlatIP(embeddings_np.shape[1])
-         self.index.add(embeddings_np)
-         return self
-
-     def get_embeddings(self) -> npt.NDArray[np.float32]:
-         if self.index is None:
-             raise ValueError("Index not built yet")
-
-         ntotal = self.index.ntotal
-         d: int = self.index.d
-         return cast(
-             npt.NDArray[np.float32],
-             faiss.rev_swig_ptr(self.index.get_xb(), ntotal * d).reshape(ntotal, d),
-         )
-
-     def search(self, query: str, top_k: int = 10) -> List[Dict]:
-         r"""Searches the index for the most similar images to the query and returns
-         the top_k results.
-
-         Args:
-             query: The query to search for.
-             top_k: The number of results to return. Defaults to 10."""
-         if self.index is None:
-             raise ValueError("Index not built yet")
-         if self.emb is None:
-             raise ValueError("Embedder not set yet")
-
-         query_embedding: npt.NDArray[np.float32] = self.emb.embed(query)
-         _, idx = self.index.search(query_embedding.reshape(1, -1), top_k)
-         return cast(List[Dict], self.df.iloc[idx[0]].to_dict(orient="records"))
-
-     def save(self, path: Union[str, Path]) -> None:
-         path = Path(path)
-         path.mkdir(parents=True)
-         self.df.to_csv(path / "data.csv")
-         if self.index is not None:
-             write_index(self.index, str(path / "data.index"))
-
-     @classmethod
-     def load(cls, path: Union[str, Path]) -> DataStore:
-         path = Path(path)
-         df = pd.read_csv(path / "data.csv", index_col=0)
-         ds = DataStore(df)
-         if Path(path / "data.index").exists():
-             ds.index = read_index(str(path / "data.index"))
-         return ds
-
-
- def build_data_store(data: Union[str, Path, list[Union[str, Path]]]) -> DataStore:
-     if isinstance(data, Path) or isinstance(data, str):
-         data = Path(data)
-         data_files = list(Path(data).glob("*"))
-     elif isinstance(data, list):
-         data_files = [Path(d) for d in data]
-
-     df = pd.DataFrame()
-     df["image_paths"] = data_files
-     df["image_id"] = [uuid.uuid4() for _ in range(len(data_files))]
-     return DataStore(df)
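For anyone pinned to 0.1.6, the deleted `DataStore` was driven roughly as below. The directory path and prompt are illustrative, and `OpenAILMM()` is assumed to be constructible with defaults; everything else comes from the removed modules shown in this diff:

    from vision_agent.data import build_data_store
    from vision_agent.emb import get_embedder
    from vision_agent.lmm import OpenAILMM

    # Caption a directory of images with an LMM, index the captions with an
    # embedder, then run a text query against the index.
    ds = (
        build_data_store("images/")             # illustrative path
        .add_lmm(OpenAILMM())                   # constructor defaults assumed
        .add_embedder(get_embedder("openai"))
        .add_column("description", "Describe this image.")
        .build_index("description")
    )
    results = ds.search("a red car", top_k=5)   # List[Dict] of matching rows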
vision_agent/emb/__init__.py DELETED
@@ -1 +0,0 @@
- from .emb import Embedder, OpenAIEmb, SentenceTransformerEmb, get_embedder
vision_agent/emb/emb.py DELETED
@@ -1,47 +0,0 @@
- from abc import ABC, abstractmethod
- from typing import cast
-
- import numpy as np
- import numpy.typing as npt
-
-
- class Embedder(ABC):
-     @abstractmethod
-     def embed(self, text: str) -> npt.NDArray[np.float32]:
-         pass
-
-
- class SentenceTransformerEmb(Embedder):
-     def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5"):
-         from sentence_transformers import SentenceTransformer
-
-         self.model = SentenceTransformer(model_name)
-
-     def embed(self, text: str) -> npt.NDArray[np.float32]:
-         return cast(
-             npt.NDArray[np.float32],
-             self.model.encode([text]).flatten().astype(np.float32),
-         )
-
-
- class OpenAIEmb(Embedder):
-     def __init__(self, model_name: str = "text-embedding-3-small"):
-         from openai import OpenAI
-
-         self.client = OpenAI()
-         self.model_name = model_name
-
-     def embed(self, text: str) -> npt.NDArray[np.float32]:
-         response = self.client.embeddings.create(input=text, model=self.model_name)
-         return np.array(response.data[0].embedding).astype(np.float32)
-
-
- def get_embedder(name: str) -> Embedder:
-     if name == "sentence-transformer":
-         return SentenceTransformerEmb()
-     elif name == "openai":
-         return OpenAIEmb()
-     else:
-         raise ValueError(
-             f"Unknown embedder name: {name}, currently support sentence-transformer, openai."
-         )
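The removed `get_embedder` factory accepted exactly the two names shown above. A minimal 0.1.6-era usage sketch (a model download or an OpenAI API key is assumed to be available); both backends return a 1-D float32 vector:

    import numpy as np

    from vision_agent.emb import get_embedder

    emb = get_embedder("sentence-transformer")  # or "openai"
    vec = emb.embed("a photo of a dog")
    assert vec.dtype == np.float32 and vec.ndim == 1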