vision-agent 1.1.12__tar.gz → 1.1.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {vision_agent-1.1.12 → vision_agent-1.1.13}/PKG-INFO +1 -1
  2. {vision_agent-1.1.12 → vision_agent-1.1.13}/pyproject.toml +1 -1
  3. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/.sim_tools/df.csv +1 -1
  4. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/lmm/lmm.py +139 -11
  5. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/tools/tools.py +4 -7
  6. {vision_agent-1.1.12 → vision_agent-1.1.13}/.gitignore +0 -0
  7. {vision_agent-1.1.12 → vision_agent-1.1.13}/LICENSE +0 -0
  8. {vision_agent-1.1.12 → vision_agent-1.1.13}/README.md +0 -0
  9. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/.sim_tools/embs.npy +0 -0
  10. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/__init__.py +0 -0
  11. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/README.md +0 -0
  12. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/__init__.py +0 -0
  13. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/agent.py +0 -0
  14. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  15. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  16. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  17. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  18. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  19. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/agent/vision_agent_v2.py +0 -0
  20. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/clients/__init__.py +0 -0
  21. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/clients/http.py +0 -0
  22. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/configs/__init__.py +0 -0
  23. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/configs/anthropic_config.py +0 -0
  24. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/configs/config.py +0 -0
  25. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/configs/openai_config.py +0 -0
  26. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/fonts/__init__.py +0 -0
  27. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  28. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/lmm/__init__.py +0 -0
  29. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/models/__init__.py +0 -0
  30. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/models/agent_types.py +0 -0
  31. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/models/lmm_types.py +0 -0
  32. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/models/tools_types.py +0 -0
  33. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/sim/__init__.py +0 -0
  34. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/sim/sim.py +0 -0
  35. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/tools/__init__.py +0 -0
  36. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/tools/meta_tools.py +0 -0
  37. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/tools/planner_tools.py +0 -0
  38. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/tools/prompts.py +0 -0
  39. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/__init__.py +0 -0
  40. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/agent.py +0 -0
  41. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/exceptions.py +0 -0
  42. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/execute.py +0 -0
  43. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/image_utils.py +0 -0
  44. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/tools.py +0 -0
  45. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/tools_doc.py +0 -0
  46. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/video.py +0 -0
  47. {vision_agent-1.1.12 → vision_agent-1.1.13}/vision_agent/utils/video_tracking.py +0 -0
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agent
-Version: 1.1.12
+Version: 1.1.13
 Summary: Toolset for Vision Agent
 Project-URL: Homepage, https://landing.ai
 Project-URL: repository, https://github.com/landing-ai/vision-agent
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vision-agent"
-version = "1.1.12"
+version = "1.1.13"
 description = "Toolset for Vision Agent"
 authors = [{ name = "Landing AI", email = "dev@landing.ai" }]
 requires-python = ">=3.9,<4.0"
--- a/vision_agent/.sim_tools/df.csv
+++ b/vision_agent/.sim_tools/df.csv
@@ -718,4 +718,4 @@ desc,doc,name
     [0, 0, 0, ..., 0, 0, 0],
     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 }],
-)",overlay_segmentation_masks
+)",overlay_segmentation_masks
--- a/vision_agent/lmm/lmm.py
+++ b/vision_agent/lmm/lmm.py
@@ -3,12 +3,16 @@ import os
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
+import base64
 
 import anthropic
 import requests
 from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
 from openai import AzureOpenAI, OpenAI
 
+from google import genai  # type: ignore
+from google.genai import types  # type: ignore
+
 from vision_agent.models import Message
 from vision_agent.utils.image_utils import encode_media
 
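The added imports pull in the Google Gen AI SDK (published on PyPI as the google-genai distribution), which the rewritten GoogleLMM in the next hunk builds on; having it installed, for example via pip install google-genai, is assumed for the new code paths.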
--- a/vision_agent/lmm/lmm.py
+++ b/vision_agent/lmm/lmm.py
@@ -516,28 +520,152 @@ class AnthropicLMM(LMM):
         return cast(str, response.content[0].text)
 
 
-class GoogleLMM(OpenAILMM):
+class GoogleLMM(LMM):
     r"""An LMM class for the Google LMMs."""
 
     def __init__(
         self,
+        model_name: str = "gemini-2.5-pro-preview-03-25",
         api_key: Optional[str] = None,
-        model_name: str = "gemini-2.0-flash-exp",
-        max_tokens: int = 4096,
-        image_detail: str = "low",
         image_size: int = 768,
+        image_detail: str = "low",
         **kwargs: Any,
     ):
-        base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
         if not api_key:
-            api_key = os.environ.get("GEMINI_API_KEY")
-
-        self.client = OpenAI(api_key=api_key, base_url=base_url)
+            api_key = os.environ.get("GOOGLE_API_KEY")
 
+        # Create the client using the Google Genai client
+        self.client = genai.Client(api_key=api_key)
         self.model_name = model_name
         self.image_size = image_size
         self.image_detail = image_detail
-
-        if "max_tokens" not in kwargs:
-            kwargs["max_tokens"] = max_tokens
         self.kwargs = kwargs
+
+    def __call__(
+        self,
+        input: Union[str, Sequence[Dict[str, Any]]],
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        if isinstance(input, str):
+            return self.generate(input, **kwargs)
+        return self.chat(input, **kwargs)
+
+    def chat(
+        self,
+        chat: Sequence[Dict[str, Any]],
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        prompt_parts = []
+        for message in chat:
+            if message["role"] != "user":
+                continue  # Gemini expects only user input
+            prompt_parts.extend(self._convert_message_parts(message, **kwargs))
+
+        tmp_kwargs = self.kwargs | kwargs
+        generation_config = self._create_generation_config(tmp_kwargs)
+
+        if tmp_kwargs.get("stream"):
+
+            def f() -> Iterator[Optional[str]]:
+                # Use the client to stream content
+                response_stream = self.client.models.generate_content_stream(
+                    model=self.model_name,
+                    contents=prompt_parts,
+                    config=generation_config,
+                )
+                for chunk in response_stream:
+                    if chunk.text:
+                        yield chunk.text
+
+            return f()
+        else:
+            # Use the client for non-streaming
+            response = self.client.models.generate_content(
+                model=self.model_name,
+                contents=prompt_parts,
+                config=generation_config,
+            )
+            return cast(str, response.text)
+
+    def generate(
+        self,
+        prompt: str,
+        media: Optional[Sequence[Union[str, Path]]] = None,
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        prompt_parts = [{"text": prompt}]
+        if media:
+            for m in media:
+                prompt_parts.append(self._convert_media_part(m, **kwargs))
+
+        tmp_kwargs = self.kwargs | kwargs
+        generation_config = self._create_generation_config(tmp_kwargs)
+
+        if tmp_kwargs.get("stream"):
+
+            def f() -> Iterator[Optional[str]]:
+                response_stream = self.client.models.generate_content_stream(
+                    model=self.model_name,
+                    contents=prompt_parts,
+                    config=generation_config,
+                )
+                for chunk in response_stream:
+                    if chunk.text:
+                        yield chunk.text
+
+            return f()
+        else:
+            response = self.client.models.generate_content(
+                model=self.model_name,
+                contents=prompt_parts,
+                config=generation_config,
+            )
+            return cast(str, response.text)
+
+    def _convert_message_parts(
+        self, message: Dict[str, Any], **kwargs: Any
+    ) -> List[Any]:
+        parts = [{"text": message["content"]}]
+        if "media" in message:
+            for media_path in message["media"]:
+                parts.append(self._convert_media_part(media_path, **kwargs))
+        return parts
+
+    def _convert_media_part(self, media: Union[str, Path], **kwargs: Any) -> types.Part:
+        resize = kwargs.get("resize", self.image_size)
+        encoded_media = encode_media(str(media), resize=resize)
+
+        if encoded_media.startswith("data:image/"):
+            encoded_media = encoded_media.split(",", 1)[-1]
+
+        binary_data = base64.b64decode(encoded_media)
+
+        return types.Part.from_bytes(
+            data=binary_data,
+            mime_type="image/png",
+        )
+
+    def _create_generation_config(
+        self, kwargs: Dict[str, Any]
+    ) -> types.GenerateContentConfig:
+        # Extract generation-specific parameters
+        config_params = {}
+
+        # Handle known parameters
+        for param in [
+            "max_output_tokens",
+            "temperature",
+            "top_p",
+            "top_k",
+            "response_mime_type",
+            "stop_sequences",
+            "candidate_count",
+            "seed",
+            "safety_settings",
+            "system_instruction",
+        ]:
+            if param in kwargs:
+                config_params[param] = kwargs[param]
+
+        # Create a GenerateContentConfig object
+        return types.GenerateContentConfig(**config_params)
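A minimal usage sketch of the rewritten GoogleLMM, based only on the hunk above; it assumes the google-genai SDK is installed, GOOGLE_API_KEY is exported, and "photo.jpg" stands in for a local image path:

from vision_agent.lmm.lmm import GoogleLMM

lmm = GoogleLMM()  # picks up GOOGLE_API_KEY from the environment when api_key is not passed

# One-shot generation with an attached image: generate() wraps the prompt and media
# into Gemini parts and calls client.models.generate_content.
answer = lmm.generate("Describe the objects in this image.", media=["photo.jpg"])
print(answer)

# Passing stream=True routes through generate_content_stream and yields text chunks.
for chunk in lmm.generate("Describe the objects in this image.", stream=True):
    if chunk:
        print(chunk, end="")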
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -2959,13 +2959,10 @@ def gemini_image_generation(
             return image
         else:
             try:
-                _LOGGER.warning("All retries failed; prompting for fresh generation.")
-                time.sleep(10)
-                output_image_bytes = try_generate_content(
-                    types.Content(parts=[types.Part(text="Generate an image.")]),
-                    num_retries=1,
-                )
-
+                current_dir = os.path.dirname(os.path.abspath(__file__))
+                img_path = os.path.join(current_dir, "../../assets/gemini.png")
+                with open(img_path, "rb") as img_file:
+                    output_image_bytes = img_file.read()
             except Exception as e:
                 raise ValueError(f"Fallback generation failed: {str(e)}")
 
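With this change, the last-resort fallback in gemini_image_generation no longer re-prompts the model; it returns a placeholder image read from assets/gemini.png, resolved two levels up from the tools module. A small sketch that mirrors that path arithmetic from outside the package (illustrative only; the relative path is taken from the hunk above):

import os
import vision_agent.tools.tools as tools_module

# Resolve the bundled placeholder the same way the fallback does
asset = os.path.normpath(
    os.path.join(os.path.dirname(os.path.abspath(tools_module.__file__)), "../../assets/gemini.png")
)
print(asset, os.path.exists(asset))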