vision-agent 0.2.140__py3-none-any.whl → 0.2.142__py3-none-any.whl

@@ -149,6 +149,7 @@ def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
+    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'owl_v2_image' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions on images. The categories in
@@ -160,6 +161,8 @@ def owl_v2_image(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -176,7 +179,38 @@ def owl_v2_image(
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
     ]
     """
+
     image_size = image.shape[:2]
+
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+        detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+        bboxes_formatted = [
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
+            for i in range(len(detections["bboxes"]))
+        ]
+        return [bbox.model_dump() for bbox in bboxes_formatted]
+
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
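A minimal usage sketch of the new parameter (the image path and job ID below are placeholders, and it assumes the referenced fine-tuning job has already reached `SUCCEEDED`):

```python
import vision_agent.tools as T

image = T.load_image("dogs.jpg")  # placeholder image path
# Passing the ID of a completed fine-tuning job routes the request through the
# fine-tuned Florence-2 model; detections from this path are returned with score 1.0.
dets = T.owl_v2_image(
    "dog", image, fine_tune_id="00000000-0000-0000-0000-000000000000"  # placeholder ID
)
```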
@@ -206,10 +240,10 @@ def owl_v2_video(
     box_threshold: float = 0.10,
 ) -> List[List[Dict[str, Any]]]:
     """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-    objects per frame given a text prompt sucha s a category name or referring
-    expression. The categories in text prompt are separated by commas. It returns a list
-    of lists where each inner list contains the score, label, and bounding box of the
-    detections for that frame.
+    objects independently per frame given a text prompt such as a category name or
+    referring expression but does not track objects across frames. The categories in
+    the text prompt are separated by commas. It returns a list of lists where each inner
+    list contains the score, label, and bounding box of the detections for that frame.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -335,7 +369,9 @@ def grounding_sam(
     return return_data
 
 
-def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_sam2_image(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_sam2_image' is a tool that can segment multiple objects given a text
     prompt such as category names or referring expressions. The categories in the text
     prompt are separated by commas. It returns a list of bounding boxes, label names,
@@ -344,6 +380,8 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -369,18 +407,52 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
         },
     ]
     """
-    buffer_bytes = numpy_to_bytes(image)
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
 
+        req_data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(
+                job_id=UUID(fine_tune_id),
+                postprocessing="sam2",
+            ),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True)
+        detections_ft = send_inference_request(req_data, "tools", v2=False)
+        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        return_data = []
+        all_masks = np.array(detections_ft["masks"])
+        for i in range(len(detections_ft["bboxes"])):
+            return_data.append(
+                {
+                    "score": 1.0,
+                    "label": detections_ft["labels"][i],
+                    "bbox": detections_ft["bboxes"][i],
+                    "mask": all_masks[i, :, :].astype(np.uint8),
+                }
+            )
+        return return_data
+
+    buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_image",
     }
-    data: Dict[str, Any] = send_inference_request(
+    detections: Dict[str, Any] = send_inference_request(
        payload, "florence2-sam2", files=files, v2=True
     )
     return_data = []
-    for _, data_i in data["0"].items():
+    for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
         label = data_i["label"]
         bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
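As with `owl_v2_image`, a hedged usage sketch (placeholder image path and job ID; the fine-tuning job must already be in the `SUCCEEDED` state):

```python
import vision_agent.tools as T

image = T.load_image("dogs.jpg")  # placeholder image path
# With fine_tune_id set, the fine-tuned Florence-2 grounding output is post-processed
# with SAM2 (postprocessing="sam2") so each detection also carries a binary mask.
dets = T.florence2_sam2_image(
    "dog", image, fine_tune_id="00000000-0000-0000-0000-000000000000"  # placeholder ID
)
masks = [d["mask"] for d in dets]  # np.uint8 masks; scores are 1.0 on this path
```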
@@ -389,17 +461,19 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray]
+    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = None
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
     expressions. You can optionally separate the categories in the text with commas. It
-    only tracks entities present in the first frame and only returns segmentation
-    masks. It is useful for tracking and counting without duplicating counts.
+    can find new objects every 'chunk_length' frames, which is useful for tracking and
+    counting without duplicating counts. It always outputs scores of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames after which florence2 is
+            re-run to find new objects.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -432,6 +506,8 @@ def florence2_sam2_video_tracking(
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_video_tracking",
     }
+    if chunk_length is not None:
+        payload["chunk_length"] = chunk_length  # type: ignore
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
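A short sketch of the new `chunk_length` option (the video path is a placeholder and the re-detection interval is chosen arbitrarily):

```python
import vision_agent.tools as T

# Frames come back as dicts after the extract_frames_and_timestamps rename (see below).
frames = [f["frame"] for f in T.extract_frames_and_timestamps("cars.mp4", fps=1)]
# Re-run Florence-2 every 10 frames so objects that enter the scene later are still
# detected and tracked; leaving chunk_length as None presumably keeps the previous
# first-frame-only behavior, since the key is only added to the payload when set.
tracks = T.florence2_sam2_video_tracking("car", frames, chunk_length=10)
```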
@@ -1119,13 +1195,13 @@ def florence2_phrase_grounding(
     return_data = []
     for i in range(len(detections["bboxes"])):
         return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
         )
-    return return_data
+    return [bbox.model_dump() for bbox in return_data]
 
 
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
@@ -1497,12 +1573,14 @@ def closest_box_distance(
 # Utility and visualization functions
 
 
-def extract_frames(
+def extract_frames_and_timestamps(
     video_uri: Union[str, Path], fps: float = 1
-) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path, url or
-    youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
-    relative time in seconds where the frame was captured. The frame is a numpy array.
+) -> List[Dict[str, Union[np.ndarray, float]]]:
+    """'extract_frames_and_timestamps' extracts frames and timestamps from a video
+    which can be a file path, url or youtube link. It returns a list of dictionaries
+    with keys "frame" and "timestamp" where "frame" is a numpy array and "timestamp" is
+    the relative time in seconds where the frame was captured. The frame is a numpy
+    array.
 
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
@@ -1510,15 +1588,23 @@ def extract_frames(
             to 1.
 
     Returns:
-        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
-        as a numpy array and the timestamp in seconds.
+        List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
+        extracted frame as a numpy array and the timestamp in seconds.
 
     Example
     -------
     >>> extract_frames("path/to/video.mp4")
-    [(frame1, 0.0), (frame2, 0.5), ...]
+    [{"frame": np.ndarray, "timestamp": 0.0}, ...]
     """
 
+    def reformat(
+        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+    ) -> List[Dict[str, Union[np.ndarray, float]]]:
+        return [
+            {"frame": frame, "timestamp": timestamp}
+            for frame, timestamp in frames_and_timestamps
+        ]
+
     if str(video_uri).startswith(
         (
             "http://www.youtube.com/",
@@ -1540,16 +1626,16 @@ def extract_frames(
                 raise Exception("No suitable video stream found")
             video_file_path = video.download(output_path=temp_dir)
 
-            return extract_frames_from_video(video_file_path, fps)
+            return reformat(extract_frames_from_video(video_file_path, fps))
     elif str(video_uri).startswith(("http", "https")):
         _, image_suffix = os.path.splitext(video_uri)
         with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
             # Download the video and save it to the temporary file
             with urllib.request.urlopen(str(video_uri)) as response:
                 tmp_file.write(response.read())
-        return extract_frames_from_video(tmp_file.name, fps)
+        return reformat(extract_frames_from_video(tmp_file.name, fps))
 
-    return extract_frames_from_video(str(video_uri), fps)
+    return reformat(extract_frames_from_video(str(video_uri), fps))
 
 
 def save_json(data: Any, file_path: str) -> None:
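For callers migrating from `extract_frames`, a small sketch of the new return shape (the video path is a placeholder):

```python
import vision_agent.tools as T

# Each item is now a dict instead of a (frame, timestamp) tuple.
frames_and_ts = T.extract_frames_and_timestamps("path/to/video.mp4", fps=1)
frames = [d["frame"] for d in frames_and_ts]          # np.ndarray per frame
timestamps = [d["timestamp"] for d in frames_and_ts]  # seconds from the start
```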
@@ -1953,7 +2039,6 @@ FUNCTION_TOOLS = [
     vit_image_classification,
     vit_nsfw_classification,
     countgd_counting,
-    florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
@@ -1968,7 +2053,7 @@ FUNCTION_TOOLS = [
 ]
 
 UTIL_TOOLS = [
-    extract_frames,
+    extract_frames_and_timestamps,
     save_json,
     load_image,
     save_image,
@@ -28,6 +28,7 @@ class FineTuning(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     job_id: UUID = Field(alias="jobId")
+    postprocessing: Optional[str] = None
 
     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
@@ -5,7 +5,7 @@ import io
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
@@ -154,15 +154,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     )
 
 
-def encode_image_bytes(image: bytes) -> str:
-    image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
+def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str:
+    if resize is not None:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
+        if image_pil.size[0] > resize or image_pil.size[1] > resize:
+            image_pil.thumbnail((resize, resize))
+    else:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
     buffer = io.BytesIO()
-    image.save(buffer, format="PNG")  # type: ignore
+    image_pil.save(buffer, format="PNG")
     encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
     return encoded_image
 
 
-def encode_media(media: Union[str, Path]) -> str:
+def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str:
     if isinstance(media, str) and media.startswith(("http", "https")):
         # for mp4 video url, we assume there is a same url but ends with png
         # vision-agent-ui will upload this png when uploading the video
@@ -192,11 +197,17 @@ def encode_media(media: Union[str, Path]) -> str:
         frames = extract_frames_from_video(str(media), fps=1)
         image = frames[len(frames) // 2]
         buffer = io.BytesIO()
-        Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
+        if resize is not None:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+            if image_pil.size[0] > resize or image_pil.size[1] > resize:
+                image_pil.thumbnail((resize, resize))
+        else:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+        image_pil.save(buffer, format="PNG")
         image_bytes = buffer.getvalue()
     else:
         image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
+    return encode_image_bytes(image_bytes, resize=resize)
 
 
 def denormalize_bbox(
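A hedged sketch of the new `resize` knob (the import path and file name are assumptions based on the file shown in this hunk):

```python
from vision_agent.utils.image_utils import encode_media  # assumed import path

# Images (or the middle frame of a video) larger than 768px on either side are
# downscaled with PIL's thumbnail() before base64 encoding; smaller media is untouched.
b64 = encode_media("large_photo.jpg", resize=768)  # placeholder file name
```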
@@ -61,6 +61,7 @@ def video_writer(
     stream.height = height - (height % 2)
     stream.width = width - (width % 2)
     stream.pix_fmt = "yuv420p"
+    stream.options = {"crf": "10"}
     for frame in frames:
         # Remove the alpha channel (convert RGBA to RGB)
         frame_rgb = frame[:, :, :3]
@@ -77,7 +78,7 @@ def video_writer(
 
 
 def frames_to_bytes(
-    frames: List[np.ndarray], fps: float = 10, file_ext: str = ".mp4"
+    frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
 ) -> bytes:
     r"""Convert a list of frames to a video file encoded into a byte string.
 
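Since the default `fps` drops from 10 to 1.0, callers that relied on the old default may want to pass it explicitly; a minimal sketch (the import path is assumed from this file):

```python
import numpy as np
from vision_agent.utils.video import frames_to_bytes  # assumed import path

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]  # dummy frames
video_bytes = frames_to_bytes(frames, fps=10.0)  # keep the old 10 fps behavior
with open("clip.mp4", "wb") as f:
    f.write(video_bytes)
```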
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.140
+Version: 0.2.142
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -74,10 +74,11 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```
 
-Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the Azure setup section):
+Ensure you have an Anthropic API key and an OpenAI API key and set them in your
+environment variables (if you are using Azure OpenAI please see the Azure setup section):
 
 ```bash
+export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
@@ -112,6 +113,9 @@ You can find more details about the streamlit app [here](examples/chat/).
 >>> resp = agent(resp)
 ```
 
+`VisionAgent` currently utilizes Claude-3.5 as its default LMM and uses OpenAI
+embeddings for tool searching.
+
 ### Vision Agent Coder
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
@@ -173,7 +177,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
     "code": "from vision_agent.tools import ..."
     "test": "calculate_filled_percentage('jar.jpg')",
     "test_result": "...",
-    "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+    "plans": {"plan1": {"thoughts": "..."}, ...},
+    "plan_thoughts": "...",
     "working_memory": ...,
 }
 ```
@@ -210,20 +215,25 @@ result = agent.chat_with_workflow(conv)
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
 while others are hosted for you. You can easily access them yourself, for example if
-you want to run `owl_v2` and visualize the output you can run:
+you want to run `owl_v2_image` and visualize the output you can run:
 
 ```python
 import vision_agent.tools as T
 import matplotlib.pyplot as plt
 
 image = T.load_image("dogs.jpg")
-dets = T.owl_v2("dogs", image)
+dets = T.owl_v2_image("dogs", image)
 viz = T.overlay_bounding_boxes(image, dets)
 plt.imshow(viz)
 plt.show()
 ```
 
-You can also add custom tools to the agent:
+You can find all available tools in `vision_agent/tools/tools.py`; however,
+`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
+the best performance. Those can be found in the same file under the `TOOLS` variable.
+
+If you can't find the tool you are looking for, you can also add custom tools to the
+agent:
 
 ```python
 import vision_agent as va
@@ -258,9 +268,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our
 we add the source code for all the tools used in `VisionAgent`.
 
 ## Additional Backends
+### Anthropic
+`AnthropicVisionAgentCoder` uses Anthropic. To get started, you just need to get an
+Anthropic API key and set it in your environment variables:
+
+```bash
+export ANTHROPIC_API_KEY="your-api-key"
+```
+
+Because Anthropic does not support embedding models, the default embedding model used
+is the OpenAI model, so you will also need to set your OpenAI API key:
+
+```bash
+export OPEN_AI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.AnthropicVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+### OpenAI
+`OpenAIVisionAgentCoder` uses OpenAI. To get started, you just need to get an OpenAI API
+key and set it in your environment variables:
+
+```bash
+export OPEN_AI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.OpenAIVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+
 ### Ollama
-We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
-a few models:
+`OllamaVisionAgentCoder` uses Ollama. To get started, you must download a few models:
 
 ```bash
 ollama pull llama3.1
@@ -281,9 +330,8 @@ tools. You can use it just like you would use `VisionAgentCoder`:
 > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
 
 ### Azure OpenAI
-We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started
-follow the Azure Setup section below. You can use it just like you would use=
-`VisionAgentCoder`:
+`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup
+section below. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
@@ -0,0 +1,33 @@
+vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
+vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
+vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
+vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
+vision_agent/agent/vision_agent.py,sha256=k1bUmvoz0KjVEu62PYA9djnq3pqzv2S1UsW6gLnTd7w,17023
+vision_agent/agent/vision_agent_coder.py,sha256=4bbebV1sKE10vsxcZR-R8P54X2HjLeU9lDt7ylIZAT4,38429
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=YWK4C--YRS1Kuab11Gn-AXBzar1j_GNnTnxi_nnaPRY,14901
+vision_agent/agent/vision_agent_prompts.py,sha256=e_ASPeRFU1yZsQhCkK_bIBG-eyIWyWXmN64lFk-r7e0,10897
+vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
+vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
+vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
+vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
+vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
+vision_agent/tools/meta_tools.py,sha256=XO5Ahe5ZauomynxgDcBuzmm0ocXwTnmZ0wjfgvOzDWc,23426
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
+vision_agent/tools/tools.py,sha256=dD_8AmAQb0oKVZHg2w2kSKlvWrG9yaKRbaHTz_kHgjA,73648
+vision_agent/tools/tools_types.py,sha256=JUOZWGW2q-dlJ85CHr9gvo9KQk_rXyjJhi-iwPNn4eM,2397
+vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
+vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
+vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
+vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
+vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
+vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
+vision_agent-0.2.142.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.142.dist-info/METADATA,sha256=yP7ShheLQ_a50CME1rbSUifRlc4ylqmM6PeIKflW9Ig,13758
+vision_agent-0.2.142.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.142.dist-info/RECORD,,
@@ -1,33 +0,0 @@
-vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
-vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
-vision_agent/agent/vision_agent.py,sha256=Ed10_rWzHu-hejb5jF9lAF7xbmQ_qAGpCxDvByZw6M8,14100
-vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
-vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
-vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
-vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
-vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
-vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
-vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
-vision_agent/tools/meta_tools.py,sha256=orYbEPWOENXwmKSmbg52_2eMAoYT9ZbV5GjudUd-f0o,22563
-vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
-vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
-vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
-vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
-vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
-vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
-vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
-vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
-vision_agent-0.2.140.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.140.dist-info/METADATA,sha256=B33v0XI-5ZlEBBu-I8DT7JrbU04PophTYEmRQMVEkBQ,12291
-vision_agent-0.2.140.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.140.dist-info/RECORD,,