vision-agent 1.1.11__py3-none-any.whl → 1.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -718,4 +718,4 @@ desc,doc,name
718
718
  [0, 0, 0, ..., 0, 0, 0],
719
719
  [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
720
720
  }],
721
- )",overlay_segmentation_masks
721
+ )",overlay_segmentation_masks
vision_agent/lmm/lmm.py CHANGED
@@ -3,12 +3,16 @@ import os
3
3
  from abc import ABC, abstractmethod
4
4
  from pathlib import Path
5
5
  from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
6
+ import base64
6
7
 
7
8
  import anthropic
8
9
  import requests
9
10
  from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
10
11
  from openai import AzureOpenAI, OpenAI
11
12
 
13
+ from google import genai # type: ignore
14
+ from google.genai import types # type: ignore
15
+
12
16
  from vision_agent.models import Message
13
17
  from vision_agent.utils.image_utils import encode_media
14
18
 
@@ -516,28 +520,152 @@ class AnthropicLMM(LMM):
516
520
  return cast(str, response.content[0].text)
517
521
 
518
522
 
519
- class GoogleLMM(OpenAILMM):
523
+ class GoogleLMM(LMM):
520
524
  r"""An LMM class for the Google LMMs."""
521
525
 
522
526
  def __init__(
523
527
  self,
528
+ model_name: str = "gemini-2.5-pro-preview-03-25",
524
529
  api_key: Optional[str] = None,
525
- model_name: str = "gemini-2.0-flash-exp",
526
- max_tokens: int = 4096,
527
- image_detail: str = "low",
528
530
  image_size: int = 768,
531
+ image_detail: str = "low",
529
532
  **kwargs: Any,
530
533
  ):
531
- base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
532
534
  if not api_key:
533
- api_key = os.environ.get("GEMINI_API_KEY")
534
-
535
- self.client = OpenAI(api_key=api_key, base_url=base_url)
535
+ api_key = os.environ.get("GOOGLE_API_KEY")
536
536
 
537
+ # Create the client using the Google Genai client
538
+ self.client = genai.Client(api_key=api_key)
537
539
  self.model_name = model_name
538
540
  self.image_size = image_size
539
541
  self.image_detail = image_detail
540
-
541
- if "max_tokens" not in kwargs:
542
- kwargs["max_tokens"] = max_tokens
543
542
  self.kwargs = kwargs
543
+
544
+ def __call__(
545
+ self,
546
+ input: Union[str, Sequence[Dict[str, Any]]],
547
+ **kwargs: Any,
548
+ ) -> Union[str, Iterator[Optional[str]]]:
549
+ if isinstance(input, str):
550
+ return self.generate(input, **kwargs)
551
+ return self.chat(input, **kwargs)
552
+
553
+ def chat(
554
+ self,
555
+ chat: Sequence[Dict[str, Any]],
556
+ **kwargs: Any,
557
+ ) -> Union[str, Iterator[Optional[str]]]:
558
+ prompt_parts = []
559
+ for message in chat:
560
+ if message["role"] != "user":
561
+ continue # Gemini expects only user input
562
+ prompt_parts.extend(self._convert_message_parts(message, **kwargs))
563
+
564
+ tmp_kwargs = self.kwargs | kwargs
565
+ generation_config = self._create_generation_config(tmp_kwargs)
566
+
567
+ if tmp_kwargs.get("stream"):
568
+
569
+ def f() -> Iterator[Optional[str]]:
570
+ # Use the client to stream content
571
+ response_stream = self.client.models.generate_content_stream(
572
+ model=self.model_name,
573
+ contents=prompt_parts,
574
+ config=generation_config,
575
+ )
576
+ for chunk in response_stream:
577
+ if chunk.text:
578
+ yield chunk.text
579
+
580
+ return f()
581
+ else:
582
+ # Use the client for non-streaming
583
+ response = self.client.models.generate_content(
584
+ model=self.model_name,
585
+ contents=prompt_parts,
586
+ config=generation_config,
587
+ )
588
+ return cast(str, response.text)
589
+
590
+ def generate(
591
+ self,
592
+ prompt: str,
593
+ media: Optional[Sequence[Union[str, Path]]] = None,
594
+ **kwargs: Any,
595
+ ) -> Union[str, Iterator[Optional[str]]]:
596
+ prompt_parts = [{"text": prompt}]
597
+ if media:
598
+ for m in media:
599
+ prompt_parts.append(self._convert_media_part(m, **kwargs))
600
+
601
+ tmp_kwargs = self.kwargs | kwargs
602
+ generation_config = self._create_generation_config(tmp_kwargs)
603
+
604
+ if tmp_kwargs.get("stream"):
605
+
606
+ def f() -> Iterator[Optional[str]]:
607
+ response_stream = self.client.models.generate_content_stream(
608
+ model=self.model_name,
609
+ contents=prompt_parts,
610
+ config=generation_config,
611
+ )
612
+ for chunk in response_stream:
613
+ if chunk.text:
614
+ yield chunk.text
615
+
616
+ return f()
617
+ else:
618
+ response = self.client.models.generate_content(
619
+ model=self.model_name,
620
+ contents=prompt_parts,
621
+ config=generation_config,
622
+ )
623
+ return cast(str, response.text)
624
+
625
+ def _convert_message_parts(
626
+ self, message: Dict[str, Any], **kwargs: Any
627
+ ) -> List[Any]:
628
+ parts = [{"text": message["content"]}]
629
+ if "media" in message:
630
+ for media_path in message["media"]:
631
+ parts.append(self._convert_media_part(media_path, **kwargs))
632
+ return parts
633
+
634
+ def _convert_media_part(self, media: Union[str, Path], **kwargs: Any) -> types.Part:
635
+ resize = kwargs.get("resize", self.image_size)
636
+ encoded_media = encode_media(str(media), resize=resize)
637
+
638
+ if encoded_media.startswith("data:image/"):
639
+ encoded_media = encoded_media.split(",", 1)[-1]
640
+
641
+ binary_data = base64.b64decode(encoded_media)
642
+
643
+ return types.Part.from_bytes(
644
+ data=binary_data,
645
+ mime_type="image/png",
646
+ )
647
+
648
+ def _create_generation_config(
649
+ self, kwargs: Dict[str, Any]
650
+ ) -> types.GenerateContentConfig:
651
+ # Extract generation-specific parameters
652
+ config_params = {}
653
+
654
+ # Handle known parameters
655
+ for param in [
656
+ "max_output_tokens",
657
+ "temperature",
658
+ "top_p",
659
+ "top_k",
660
+ "response_mime_type",
661
+ "stop_sequences",
662
+ "candidate_count",
663
+ "seed",
664
+ "safety_settings",
665
+ "system_instruction",
666
+ ]:
667
+ if param in kwargs:
668
+ config_params[param] = kwargs[param]
669
+
670
+ # Create a GenerateContentConfig object
671
+ return types.GenerateContentConfig(**config_params)
@@ -2959,13 +2959,10 @@ def gemini_image_generation(
2959
2959
  return image
2960
2960
  else:
2961
2961
  try:
2962
- _LOGGER.warning("All retries failed; prompting for fresh generation.")
2963
- time.sleep(10)
2964
- output_image_bytes = try_generate_content(
2965
- types.Content(parts=[types.Part(text="Generate an image.")]),
2966
- num_retries=1,
2967
- )
2968
-
2962
+ current_dir = os.path.dirname(os.path.abspath(__file__))
2963
+ img_path = os.path.join(current_dir, "../../assets/gemini.png")
2964
+ with open(img_path, "rb") as img_file:
2965
+ output_image_bytes = img_file.read()
2969
2966
  except Exception as e:
2970
2967
  raise ValueError(f"Fallback generation failed: {str(e)}")
2971
2968
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vision-agent
3
- Version: 1.1.11
3
+ Version: 1.1.13
4
4
  Summary: Toolset for Vision Agent
5
5
  Project-URL: Homepage, https://landing.ai
6
6
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -1,5 +1,5 @@
1
1
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
- vision_agent/.sim_tools/df.csv,sha256=e87wnyPtPNo2lw6kPQmU9RnzGq81Q7-YXvrPox1GRYA,41876
2
+ vision_agent/.sim_tools/df.csv,sha256=fLh8HN76ezbOXZUoZbnkhNi5vvjYif2jSblHtRdY8dY,41875
3
3
  vision_agent/.sim_tools/embs.npy,sha256=uUPZ6QuCAr8JAtFa1L9ndAag5ycptIeJ2I8P9U8Y6YY,245888
4
4
  vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
5
5
  vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
@@ -19,7 +19,7 @@ vision_agent/configs/openai_config.py,sha256=Bw7ElBYmBcaZttyRBoNpcy3uTkqg5qADk8L
19
19
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
21
21
  vision_agent/lmm/__init__.py,sha256=4qX2lmGnKWHeKftXueEi9xj_ieK2nQh_ipHf72nKGFk,84
22
- vision_agent/lmm/lmm.py,sha256=utGJMeGEKImqHrY0q9kGu0uK3owG8wKyDustwrDrLto,19421
22
+ vision_agent/lmm/lmm.py,sha256=w23nWSmUiW1rxfRC-Td44-UR3-8k0ey80-0SVZraeqA,23681
23
23
  vision_agent/models/__init__.py,sha256=eIP0pD5dYog8zUA7uuTmUxCF6SIutbLRLRE0cmuCJgQ,326
24
24
  vision_agent/models/agent_types.py,sha256=vBZ9-ns5lHDdFMO7ulCGGeZ6OwRo3gK4O3vN0814IWc,3064
25
25
  vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1cXmw,305
@@ -30,7 +30,7 @@ vision_agent/tools/__init__.py,sha256=PRUka2eqHwPWJxwfpLj-O2Ab7hXG_dsE1Aov3TE6te
30
30
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
31
31
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
- vision_agent/tools/tools.py,sha256=K2QW-0esNBygtlRcyUwVRRhV1FV5UaznHYhFkFZkL-I,130851
33
+ vision_agent/tools/tools.py,sha256=A1YpJuarR1P9ZLnCuakxLiUUtYsnlrvfwlUrkBey_FU,130803
34
34
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
35
35
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
36
36
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
40
40
  vision_agent/utils/tools_doc.py,sha256=PKcXXbJktiuPi9q6Q1zXzFx24Dh229SNgWBDtZ2fQSQ,2730
41
41
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
42
42
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
43
- vision_agent-1.1.11.dist-info/METADATA,sha256=23g2wi-mLe8lh7zchrhAruxqsyJ1nKD8ILKO4s11Kfw,12673
44
- vision_agent-1.1.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
- vision_agent-1.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
46
- vision_agent-1.1.11.dist-info/RECORD,,
43
+ vision_agent-1.1.13.dist-info/METADATA,sha256=1LVRyxXfxT_eGGfpgK5fioWESB6FWx4LDm_xylNpZdY,12673
44
+ vision_agent-1.1.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
+ vision_agent-1.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
46
+ vision_agent-1.1.13.dist-info/RECORD,,