vision-agent 0.2.75__py3-none-any.whl → 0.2.77__py3-none-any.whl

This diff compares the contents of two publicly released versions of the vision-agent package, 0.2.75 and 0.2.77, as they appear in their public registry. It is provided for informational purposes only.
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -12,10 +12,18 @@ from .tools import (
     closest_box_distance,
     closest_mask_distance,
     extract_frames,
+    florencev2_image_caption,
     get_tool_documentation,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    florencev2_roberta_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
@@ -27,6 +35,7 @@ from .tools import (
     save_image,
     save_json,
     save_video,
+    template_match,
     vit_image_classification,
     vit_nsfw_classification,
 )
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -14,6 +14,7 @@ import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore
 
 from vision_agent.tools.tool_utils import send_inference_request
 from vision_agent.utils import extract_frames_from_video
@@ -126,7 +127,7 @@ def owl_v2(
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
-    are separated by commas or periods. It returns a list of bounding boxes with
+    are separated by commas. It returns a list of bounding boxes with
     normalized coordinates, label names and associated probability scores.
 
     Parameters:
@@ -136,7 +137,6 @@ def owl_v2(
             to 0.10.
         iou_threshold (float, optional): The threshold for the Intersection over Union
             (IoU). Defaults to 0.10.
-        model_size (str, optional): The size of the model to use.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
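A minimal usage sketch (not part of the diff), assuming owl_v2 is re-exported from vision_agent.tools like the other tools and takes (prompt, image) in that order; the image path is a placeholder:

    from vision_agent.tools import load_image, owl_v2

    image = load_image("street.jpg")  # placeholder test image
    # Categories are comma-separated, per the updated docstring above.
    detections = owl_v2("person, car", image)
    for det in detections:
        print(det["label"], det["score"], det["bbox"])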
@@ -180,7 +180,7 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can detect and segment multiple objects given a
+    """'grounding_sam' is a tool that can segment multiple objects given a
     text prompt such as category names or referring expressions. The categories in text
     prompt are separated by commas or periods. It returns a list of bounding boxes,
     label names, mask file names and associated probability scores.
@@ -242,12 +242,12 @@ def grounding_sam(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 0.5
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
-    timestamp), where timestamp is the relative time in seconds where the frame was
-    captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path or youtube
+    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+    time in seconds where the frame was captured. The frame is a numpy array.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file.
+        video_uri (Union[str, Path]): The path to the video file or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 0.5.
 
@@ -261,6 +261,29 @@ def extract_frames(
     [(frame1, 0.0), (frame2, 0.5), ...]
     """
 
+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
     return extract_frames_from_video(str(video_uri), fps)
 
 
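A minimal usage sketch (not part of the diff) of the YouTube support added above; the URL is a placeholder and pytube must be installed (it is added as a dependency further down in this diff):

    from vision_agent.tools import extract_frames

    # A YouTube link is now accepted in addition to a local file path.
    frames = extract_frames("https://www.youtube.com/watch?v=VIDEO_ID", fps=1.0)
    for frame, timestamp in frames:
        print(timestamp, frame.shape)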
@@ -381,6 +404,35 @@ def loca_visual_prompt_counting(
     return resp_data
 
 
+def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florencev2_roberta_vqa' is a tool that takes an image and analyzes
+    its contents, generates detailed captions and then tries to answer the given
+    question using the generated context. It returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> florencev2_roberta_vqa('What is the top left animal in this image ?', image)
+    'white tiger'
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "prompt": prompt,
+        "tool": "image_question_answering_with_context",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
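A minimal usage sketch (not part of the diff) for the new florencev2_roberta_vqa tool; the image path is a placeholder and load_image is assumed to return a numpy array as elsewhere in the package:

    from vision_agent.tools import florencev2_roberta_vqa, load_image

    image = load_image("zoo.jpg")  # placeholder test image
    answer = florencev2_roberta_vqa("How many animals are visible?", image)
    print(answer)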
@@ -391,8 +443,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         image (np.ndarray): The reference image used for the question
 
     Returns:
-        str: A string which is the answer to the given prompt. E.g. {'text': 'This
-        image contains a cat sitting on a table with a bowl of milk.'}.
+        str: A string which is the answer to the given prompt.
 
     Example
     -------
@@ -521,6 +572,309 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore
 
 
+def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florencev2_image_caption' is a tool that can caption or describe an image based
+    on its contents. It returns a text describing the image.
+
+    Parameters:
+        image (np.ndarray): The image to caption
+        detail_caption (bool): If True, the caption will be as detailed as possible else
+            the caption will be a brief description.
+
+    Returns:
+        str: A string which is the caption for the given image.
+
+    Example
+    -------
+    >>> florencev2_image_caption(image, False)
+    'This image contains a cat sitting on a table with a bowl of milk.'
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "florence2_image_captioning",
+        "detail_caption": detail_caption,
+    }
+
+    answer = send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
+def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect common objects in an
+    image without any text prompt or thresholding. It returns a list of detected objects
+    as labels and their location as bounding boxes.
+
+    Parameters:
+        image (np.ndarray): The image to used to detect objects
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded
+
+    Example
+    -------
+    >>> florencev2_object_detection(image)
+    [
+        {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "object_detection",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+    for i in range(len(answer["bboxes"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "label": answer["labels"][i],
+                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
+def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'detr_segmentation' is a tool that can segment common objects in an
+    image without any text prompt. It returns a list of detected objects
+    as labels, their regions as masks and their scores.
+
+    Parameters:
+        image (np.ndarray): The image used to segment things and objects
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label
+            and mask of the detected objects. The mask is binary 2D numpy array where 1
+            indicates the object and 0 indicates the background.
+
+    Example
+    -------
+    >>> detr_segmentation(image)
+    [
+        {
+            'score': 0.45,
+            'label': 'window',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+        {
+            'score': 0.70,
+            'label': 'bird',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "panoptic_segmentation",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+
+    for i in range(len(answer["scores"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "label": answer["labels"][i],
+                "mask": rle_decode(
+                    mask_rle=answer["masks"][i], shape=answer["mask_shape"][0]
+                ),
+            }
+        )
+    return return_data
+
+
+def depth_anything_v2(image: np.ndarray) -> np.ndarray:
+    """'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
+    depth image from a given RGB image. The returned depth image is monochrome and
+    represents depth values as pixel intesities with pixel values ranging from 0 to 255.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate depth image
+
+    Returns:
+        np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> depth_anything_v2(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_depth",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
+    return return_data
+
+
+def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
+    """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
+    to generate a soft edge image (HED) from a given RGB image. The returned image is
+    monochrome and represents object boundaries as soft white edges on black background
+
+    Parameters:
+        image (np.ndarray): The image to used to generate soft edge image
+
+    Returns:
+        np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> generate_soft_edge_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_hed",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
+    return return_data
+
+
+def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
+    """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
+    image. The returned RGB image is texture mapped image of the surface normals and the
+    RGB values represent the surface normals in the x, y, z directions.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate normal image
+
+    Returns:
+        np.ndarray: A mapped normal image with RGB pixel values indicating surface
+            normals in x, y, z directions.
+
+    Example
+    -------
+    >>> dpt_hybrid_midas(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_normal",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    return return_data
+
+
+def generate_pose_image(image: np.ndarray) -> np.ndarray:
+    """'generate_pose_image' is a tool that generates a open pose bone/stick image from
+    a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
+    and background as black.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate pose image
+
+    Returns:
+        np.ndarray: A bone or pose image indicating the pose and keypoints
+
+    Example
+    -------
+    >>> generate_pose_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_pose",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    return return_data
+
+
+def template_match(
+    image: np.ndarray, template_image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'template_match' is a tool that can detect all instances of a template in
+    a given image. It returns the locations of the detected template, a corresponding
+    similarity score of the same
+
+    Parameters:
+        image (np.ndarray): The image used for searching the template
+        template_image (np.ndarray): The template image or crop to search in the image
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score and
+            bounding box of the detected template with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+    >>> template_match(image, template)
+    [
+        {'score': 0.79, 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.38, 'bbox': [0.2, 0.21, 0.45, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    template_image_b64 = convert_to_b64(template_image)
+    data = {
+        "image": image_b64,
+        "template": template_image_b64,
+        "tool": "template_match",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+    for i in range(len(answer["bboxes"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.
 
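A minimal usage sketch (not part of the diff) exercising a few of the tools added in the hunk above; the image paths are placeholders:

    from vision_agent.tools import (
        depth_anything_v2,
        florencev2_image_caption,
        load_image,
        template_match,
    )

    image = load_image("shelf.jpg")       # placeholder test image
    template = load_image("product.jpg")  # placeholder crop to search for

    print(florencev2_image_caption(image, detail_caption=False))
    depth = depth_anything_v2(image)      # grayscale uint8 depth map, values 0-255
    print(depth.shape, depth.dtype)
    print(template_match(image, template))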
@@ -733,7 +1087,7 @@ def overlay_bounding_boxes(
         image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
     )
     """
-    pil_image = Image.fromarray(image.astype(np.uint8))
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
 
     if len(set([box["label"] for box in bboxes])) > len(COLORS):
         _LOGGER.warning(
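A short sketch (not part of the diff) of what the added .convert("RGB") appears to enable: annotating a single-channel image, such as the output of the new depth_anything_v2 tool, with colored boxes. The bounding-box format is the one shown in the docstring example; the image path is a placeholder:

    from vision_agent.tools import depth_anything_v2, load_image, overlay_bounding_boxes

    image = load_image("street.jpg")  # placeholder test image
    depth = depth_anything_v2(image)  # single-channel uint8 array
    annotated = overlay_bounding_boxes(
        depth, [{"score": 0.99, "label": "dinosaur", "bbox": [0.1, 0.11, 0.35, 0.4]}]
    )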
@@ -920,8 +1274,14 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-    git_vqa_v2,
-    blip_image_caption,
+    florencev2_roberta_vqa,
+    florencev2_image_caption,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
     save_json,
@@ -931,6 +1291,7 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
+    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -11,10 +11,9 @@ import tempfile
 import traceback
 import warnings
 from enum import Enum
-from io import IOBase
 from pathlib import Path
 from time import sleep
-from typing import IO, Any, Dict, Iterable, List, Optional, Union, cast
+from typing import Any, Dict, Iterable, List, Optional, Union
 
 import nbformat
 import tenacity
@@ -33,6 +32,7 @@ from typing_extensions import Self
 
 load_dotenv()
 _LOGGER = logging.getLogger(__name__)
+_SESSION_TIMEOUT = 300  # 5 minutes
 
 
 class MimeType(str, Enum):
@@ -403,11 +403,8 @@ class CodeInterpreter(abc.ABC):
         self.restart_kernel()
         return self.exec_cell(code)
 
-    def upload_file(self, file: Union[str, Path, IO]) -> str:
+    def upload_file(self, file: Union[str, Path]) -> str:
         # Default behavior is a no-op (for local code interpreter)
-        assert not isinstance(
-            file, IO
-        ), "Don't pass IO objects to upload_file() of local interpreter"
         return str(file)
 
     def download_file(self, file_path: str) -> Path:
@@ -416,7 +413,6 @@ class CodeInterpreter(abc.ABC):
 
 
 class E2BCodeInterpreter(CodeInterpreter):
-    KEEP_ALIVE_SEC: int = 300
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
@@ -437,8 +433,8 @@ print(f"Vision Agent version: {va_version}")"""
437
433
  _LOGGER.info(f"E2BCodeInterpreter initialized:\n{sys_versions}")
438
434
 
439
435
  def close(self, *args: Any, **kwargs: Any) -> None:
440
- self.interpreter.notebook.close()
441
436
  self.interpreter.close()
437
+ self.interpreter.kill()
442
438
 
443
439
  def restart_kernel(self) -> None:
444
440
  self.interpreter.notebook.restart_kernel()
@@ -449,25 +445,27 @@ print(f"Vision Agent version: {va_version}")"""
449
445
  retry=tenacity.retry_if_exception_type(TimeoutError),
450
446
  )
451
447
  def exec_cell(self, code: str) -> Execution:
452
- self.interpreter.keep_alive(E2BCodeInterpreter.KEEP_ALIVE_SEC)
448
+ if not self.interpreter.is_running():
449
+ raise ConnectionResetError(
450
+ "Remote sandbox is closed unexpectedly. Please retry the operation."
451
+ )
452
+ self.interpreter.set_timeout(_SESSION_TIMEOUT) # Extend the life of the sandbox
453
453
  execution = self.interpreter.notebook.exec_cell(code, timeout=self.timeout)
454
454
  return Execution.from_e2b_execution(execution)
455
455
 
456
- def upload_file(self, file: Union[str, Path, IO]) -> str:
457
- try:
458
- if isinstance(file, (Path, str)):
459
- file = open(file, "rb")
460
- return cast(str, self.interpreter.upload_file(cast(IO, file)))
461
- finally:
462
- assert isinstance(file, IOBase), f"Unexpected file type: {type(file)}"
463
- file.close()
464
- _LOGGER.info(f"File ({file}) is uploaded to: {file.name}")
456
+ def upload_file(self, file: Union[str, Path]) -> str:
457
+ file_name = Path(file).name
458
+ remote_path = f"/home/user/{file_name}"
459
+ with open(file, "rb") as f:
460
+ self.interpreter.files.write(path=remote_path, data=f)
461
+ _LOGGER.info(f"File ({file}) is uploaded to: {remote_path}")
462
+ return remote_path
465
463
 
466
464
  def download_file(self, file_path: str) -> Path:
467
- file = tempfile.NamedTemporaryFile(mode="w+b", delete=False)
468
- file.write(self.interpreter.download_file(file_path))
469
- _LOGGER.info(f"File ({file_path}) is downloaded to: {file.name}")
470
- return Path(file.name)
465
+ with tempfile.NamedTemporaryFile(mode="w+b", delete=False) as file:
466
+ file.write(self.interpreter.files.read(path=file_path, format="bytes"))
467
+ _LOGGER.info(f"File ({file_path}) is downloaded to: {file.name}")
468
+ return Path(file.name)
471
469
 
472
470
  @staticmethod
473
471
  @tenacity.retry(
@@ -480,7 +478,7 @@ print(f"Vision Agent version: {va_version}")"""
480
478
 
481
479
 
482
480
  class LocalCodeInterpreter(CodeInterpreter):
483
- def __init__(self, timeout: int = 600) -> None:
481
+ def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None:
484
482
  super().__init__(timeout=timeout)
485
483
  self.nb = nbformat.v4.new_notebook()
486
484
  self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
@@ -568,9 +566,9 @@ class CodeInterpreterFactory:
568
566
  @staticmethod
569
567
  def new_instance() -> CodeInterpreter:
570
568
  if os.getenv("CODE_SANDBOX_RUNTIME") == "e2b":
571
- instance: CodeInterpreter = E2BCodeInterpreter(timeout=600)
569
+ instance: CodeInterpreter = E2BCodeInterpreter(timeout=_SESSION_TIMEOUT)
572
570
  else:
573
- instance = LocalCodeInterpreter(timeout=600)
571
+ instance = LocalCodeInterpreter(timeout=_SESSION_TIMEOUT)
574
572
  atexit.register(instance.close)
575
573
  return instance
576
574
 
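A minimal lifecycle sketch (not part of the diff) of the interpreter API these hunks modify; the file names are placeholders and the e2b branch assumes the CODE_SANDBOX_RUNTIME environment variable checked above:

    import os

    from vision_agent.utils.execute import CodeInterpreterFactory

    # "e2b" selects E2BCodeInterpreter; anything else falls back to LocalCodeInterpreter.
    # Both now default to the 300-second _SESSION_TIMEOUT introduced above.
    os.environ["CODE_SANDBOX_RUNTIME"] = "e2b"
    interpreter = CodeInterpreterFactory.new_instance()

    remote_path = interpreter.upload_file("input.jpg")       # placeholder local file
    execution = interpreter.exec_cell(f"print({remote_path!r})")
    local_copy = interpreter.download_file("results.json")   # placeholder remote path
    interpreter.close()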
--- a/vision_agent-0.2.75.dist-info/METADATA
+++ b/vision_agent-0.2.77.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.75
+Version: 0.2.77
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,7 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: e2b (>=0.17.1,<0.18.0)
-Requires-Dist: e2b-code-interpreter (>=0.0.9,<0.0.10)
+Requires-Dist: e2b-code-interpreter (==0.0.11a1)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
@@ -23,6 +23,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
--- a/vision_agent-0.2.75.dist-info/RECORD
+++ b/vision_agent-0.2.77.dist-info/RECORD
@@ -7,17 +7,17 @@ vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=bw24xyQJHGzmph5e-bKCiTh9AX6tRFI2OUd0mofxjZI,68
 vision_agent/lmm/lmm.py,sha256=TzzACjTP1MNSrHolUWY7fEJzdVfZELQyImRpT8IU_1E,11690
-vision_agent/tools/__init__.py,sha256=aE1O8cMeLDPO50Sc-CuAQ_Akh0viz7vBxDcVeZNqsA0,1604
+vision_agent/tools/__init__.py,sha256=mF47kfi5X5jfboUxULJnWnFbv1M9uTmmCU3_0uBZVwk,1838
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=ZOY45bCX3nlo6iGwaZ8RVpRJB-vWxkXDed9oegT7-p0,1838
-vision_agent/tools/tools.py,sha256=TaDZIvYsYNleqDsETfoZiPWPBZjyimXhudLdFZ5NsLE,31386
+vision_agent/tools/tools.py,sha256=TkZqNYX-ocwdaCdXd6c6tysSa_HX2y6Nrgl4JKni4IQ,43661
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
-vision_agent/utils/execute.py,sha256=QImS69SN00logF-E68aNpT7YsJVRQOhZYlNLmCNEfro,21337
+vision_agent/utils/execute.py,sha256=DMaQz5-yULxDx-TlSMTRKOPHE7VmyR7PArhXXilm7h0,21368
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=ci6Eta73dDgLP1Ajtknbgmf1g8aAvBHqlVQvBuLMKXQ,4427
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.75.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.75.dist-info/METADATA,sha256=Y0bkCNXRi71LZ09EFHasQ8HJ0dvX54mIeW1IkPfQhvo,9405
-vision_agent-0.2.75.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.75.dist-info/RECORD,,
+vision_agent-0.2.77.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.77.dist-info/METADATA,sha256=2GjXWlij7wzd19pzbjNRt__AVhpVtLcAe_WfGnydxTI,9433
+vision_agent-0.2.77.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.77.dist-info/RECORD,,