vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1488,8 +1488,8 @@ def agentic_object_detection(
1488
1488
  """'agentic_object_detection' is a tool that can detect multiple objects given a
1489
1489
  text prompt such as object names or referring expressions on images. It's
1490
1490
  particularly good at detecting specific objects given detailed descriptive prompts
1491
- but runs slower. It returns a list of bounding boxes with normalized coordinates,
1492
- label names and associated probability scores.
1491
+ but runs slower so not ideal for high counts. It returns a list of bounding boxes
1492
+ with normalized coordinates, label names and associated confidence score of 1.0.
1493
1493
 
1494
1494
  Parameters:
1495
1495
  prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -1533,8 +1533,9 @@ def agentic_sam2_instance_segmentation(
1533
1533
  """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
1534
1534
  instances given a text prompt such as object names or referring expressions on
1535
1535
  images. It's particularly good at detecting specific objects given detailed
1536
- descriptive prompts but runs slower. It returns a list of bounding boxes with
1537
- normalized coordinates, label names, masks and associated probability scores.
1536
+ descriptive prompts but runs slower so not ideal for high counts. It returns a list
1537
+ of bounding boxes with normalized coordinates, label names, masks and associated
1538
+ confidence score of 1.0.
1538
1539
 
1539
1540
  Parameters:
1540
1541
  prompt (str): The object that needs to be counted, only supports a single
@@ -1591,9 +1592,9 @@ def agentic_sam2_video_tracking(
1591
1592
  """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
1592
1593
  objects in a video given a text prompt such as object names or referring
1593
1594
  expressions. It's particularly good at detecting specific objects given detailed
1594
- descriptive prompts but runs slower, and returns a list of bounding boxes, label
1595
- names, masks and associated probability scores and is useful for tracking and
1596
- counting without duplicating counts.
1595
+ descriptive prompts but runs slower so not ideal for high counts. It returns a list
1596
+ of bounding boxes, label names, masks and associated confidence score of 1.0 and is
1597
+ useful for tracking and counting without duplicating counts.
1597
1598
 
1598
1599
  Parameters:
1599
1600
  prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -2307,22 +2308,10 @@ def _qwenvl_activity_recognition(
2307
2308
  return [0.0] * len(segment)
2308
2309
 
2309
2310
 
2310
- def _qwen2vl_activity_recognition(
2311
- segment: List[np.ndarray], prompt: str
2312
- ) -> List[float]:
2313
- return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
2314
-
2315
-
2316
- def _qwen25vl_activity_recognition(
2317
- segment: List[np.ndarray], prompt: str
2318
- ) -> List[float]:
2319
- return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
2320
-
2321
-
2322
2311
  def activity_recognition(
2323
2312
  prompt: str,
2324
2313
  frames: List[np.ndarray],
2325
- model: str = "qwen2vl",
2314
+ model: str = "qwen25vl",
2326
2315
  chunk_length_frames: int = 10,
2327
2316
  ) -> List[float]:
2328
2317
  """'activity_recognition' is a tool that can recognize activities in a video given a
@@ -2371,12 +2360,12 @@ def activity_recognition(
2371
2360
  elif model == "qwen2vl":
2372
2361
 
2373
2362
  def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
2374
- return _qwen2vl_activity_recognition(segment, prompt)
2363
+ return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
2375
2364
 
2376
2365
  elif model == "qwen25vl":
2377
2366
 
2378
2367
  def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
2379
- return _qwen25vl_activity_recognition(segment, prompt)
2368
+ return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
2380
2369
 
2381
2370
  else:
2382
2371
  raise ValueError(f"Invalid model: {model}")
@@ -3488,9 +3477,9 @@ def _plot_counting(
3488
3477
 
3489
3478
 
3490
3479
  FUNCTION_TOOLS = [
3491
- owlv2_object_detection,
3492
- owlv2_sam2_instance_segmentation,
3493
- owlv2_sam2_video_tracking,
3480
+ glee_object_detection,
3481
+ glee_sam2_instance_segmentation,
3482
+ glee_sam2_video_tracking,
3494
3483
  countgd_object_detection,
3495
3484
  countgd_sam2_instance_segmentation,
3496
3485
  countgd_sam2_video_tracking,
@@ -3502,8 +3491,8 @@ FUNCTION_TOOLS = [
3502
3491
  document_extraction,
3503
3492
  document_qa,
3504
3493
  ocr,
3505
- qwen2_vl_images_vqa,
3506
- qwen2_vl_video_vqa,
3494
+ qwen25_vl_images_vqa,
3495
+ qwen25_vl_video_vqa,
3507
3496
  activity_recognition,
3508
3497
  depth_anything_v2,
3509
3498
  generate_pose_image,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 1.0.4
3
+ Version: 1.0.7
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -65,10 +65,10 @@ pip install vision-agent
65
65
 
66
66
  ```bash
67
67
  export ANTHROPIC_API_KEY="your-api-key"
68
- export OPENAI_API_KEY="your-api-key"
68
+ export GEMINI_API_KEY="your-api-key"
69
69
  ```
70
70
 
71
- > **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
71
+ > **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
72
72
 
73
73
  You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
74
74
 
@@ -147,5 +147,33 @@ directory. For example to change to Anthropic simply just run:
147
147
  cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
148
148
  ```
149
149
 
150
+ You can also modify the existing `config.py` file yourself to use a different LLM
151
+ provider, for example if you wanted to change the planner from Anthropic inside
152
+ `config.py` to OpenAI you would replace this code:
153
+ ```python
154
+ planner: Type[LMM] = Field(default=AnthropicLMM)
155
+ planner_kwargs: dict = Field(
156
+ default_factory=lambda: {
157
+ "model_name": "claude-3-7-sonnet-20250219",
158
+ "temperature": 0.0,
159
+ "image_size": 768,
160
+ }
161
+ )
162
+ ```
163
+
164
+ with this code:
165
+
166
+ ```python
167
+ planner: Type[LMM] = Field(default=OpenAILMM)
168
+ planner_kwargs: dict = Field(
169
+ default_factory=lambda: {
170
+ "model_name": "gpt-4o-2024-11-20",
171
+ "temperature": 0.0,
172
+ "image_size": 768,
173
+ "image_detail": "low",
174
+ }
175
+ )
176
+ ```
177
+
150
178
  > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
151
179
 
@@ -1,43 +1,36 @@
1
- vision_agent/.sim_tools/df.csv,sha256=mIr1iubLDqGsL3K3ab6bmh6PtLvmOpvnaIX28lxdV6c,40706
2
- vision_agent/.sim_tools/embs.npy,sha256=pZZMFMg0rkIAOpMOjN7gjD58hPK07c2ylfQ9YST8xFA,245888
1
+ vision_agent/.sim_tools/df.csv,sha256=jCyBDlLxI9_yAxzLZcoN2BPpveF1yh29AlfdSAGTZ4A,40842
2
+ vision_agent/.sim_tools/embs.npy,sha256=QN8Ojc0Mv4_OS6WA4elvBhXTDHcpx2g1pLxsGqk4IQU,245888
3
3
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
4
4
  vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
5
- vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
5
+ vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
6
6
  vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
7
- vision_agent/agent/vision_agent.py,sha256=lLNIapK7lhbGSPdF2RLIh7wBNZMGiM60kFaLXkgXV8g,23465
8
- vision_agent/agent/vision_agent_coder.py,sha256=SQy83t3b2vXFiYV4ynlT-g7AkUIP8zy03culkAuHMHA,27353
9
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=D4RJxTWoxpl-WtYRvHNxaLSdWVHsdYb0jJIQ2ZCGU0A,12277
10
7
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
11
8
  vision_agent/agent/vision_agent_coder_v2.py,sha256=ELc_J8Q4NKPs7YETu3a9O0Vk1zN3k6QfHBgu0M0IWGk,17450
12
- vision_agent/agent/vision_agent_planner.py,sha256=8LeUsxUlGuQMqCdJ6jHXOe6RpKpzLtMW1AaJlfuIfzw,18680
13
- vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
14
- vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=TiiF5BGnFVraFlQnDaeRU67927LvszvpcMUOgVgo0ps,35843
15
- vision_agent/agent/vision_agent_planner_v2.py,sha256=FuvTDe1mfznh4lkDGUyXa6yip8zQV5TDSxUG5zrsWc0,22010
16
- vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
9
+ vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=YARVphHKLMNUqCeOsrManvgecl77RP1g51vtt7JpdWk,35937
10
+ vision_agent/agent/vision_agent_planner_v2.py,sha256=Aww_BJhTFKZ5XjYe8FW57z2Gwp2se0vg1t1DKLGRAyQ,22050
17
11
  vision_agent/agent/vision_agent_prompts_v2.py,sha256=OnHmmteAj8__2spEXNcckApQvEpkK_JIZd_SWzEyg9c,4230
18
12
  vision_agent/agent/vision_agent_v2.py,sha256=iPW6DowH7wCFIA5vb1SdSLfZFWbn_oSC7Xa8uO8KIJI,11675
19
13
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
14
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
21
15
  vision_agent/configs/__init__.py,sha256=Iu75-w9_nlPmnB_qKA7nYaaaHf7xtTrDmK8N4v2WV34,27
22
- vision_agent/configs/anthropic_config.py,sha256=T1UuESgiY8913A6wA42P7-cg8FTk9-LkJpyywo7OnIQ,4298
23
- vision_agent/configs/anthropic_openai_config.py,sha256=rUz5zca4Pn5dTUwJXiJzRDYua5PWizApCKI3y0zOvhc,4699
24
- vision_agent/configs/config.py,sha256=rUz5zca4Pn5dTUwJXiJzRDYua5PWizApCKI3y0zOvhc,4699
25
- vision_agent/configs/openai_config.py,sha256=v2_AIY89d7LKWn4uqA2G047U2IdmnqZrGH2Iww9gRIw,4498
16
+ vision_agent/configs/anthropic_config.py,sha256=FBW3PsrZn4gJXhr-v1GGayuJuBPafzDz2Q4vyI_rBzA,4692
17
+ vision_agent/configs/config.py,sha256=FjpYViBJHMizgoZ3dp7T4Lbhbh1MHHstaG7icVdVsrU,4694
18
+ vision_agent/configs/openai_config.py,sha256=Bw7ElBYmBcaZttyRBoNpcy3uTkqg5qADk8LP0wse2NQ,4498
26
19
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
20
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
28
21
  vision_agent/lmm/__init__.py,sha256=4qX2lmGnKWHeKftXueEi9xj_ieK2nQh_ipHf72nKGFk,84
29
- vision_agent/lmm/lmm.py,sha256=XYp1frrqQ-6q-0y2IWwM8-EIH5UrFZ21SAhkcM32J9w,19355
22
+ vision_agent/lmm/lmm.py,sha256=utGJMeGEKImqHrY0q9kGu0uK3owG8wKyDustwrDrLto,19421
30
23
  vision_agent/models/__init__.py,sha256=eIP0pD5dYog8zUA7uuTmUxCF6SIutbLRLRE0cmuCJgQ,326
31
24
  vision_agent/models/agent_types.py,sha256=vBZ9-ns5lHDdFMO7ulCGGeZ6OwRo3gK4O3vN0814IWc,3064
32
25
  vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1cXmw,305
33
26
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
34
27
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
35
28
  vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
36
- vision_agent/tools/__init__.py,sha256=H7FWx0OXGVIjrSOTpNH-YwE4LBuOfThZTG-SHFpo_Z8,2576
37
- vision_agent/tools/meta_tools.py,sha256=DNRXHX9nZ1GBeqeLiq87sBshoe0aiZeYasETbG-9neI,24053
38
- vision_agent/tools/planner_tools.py,sha256=mlpUODpsN9sg-OKdsBm-6maK3eA97FUgAWm1etfgx4E,19553
29
+ vision_agent/tools/__init__.py,sha256=H8M5v--cANBiOWvAfUJNj9cq9PKm_DjRrG1MeNRWpHs,2434
30
+ vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
31
+ vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
39
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
40
- vision_agent/tools/tools.py,sha256=-jBrykNYPinRpDXnBsnzlSgJ_hbZClzCp3pkzWjTUxs,122098
33
+ vision_agent/tools/tools.py,sha256=4gwL8EFMwm6l0MujftJ8G8BO2z8Dh_a4FPjy_xUmYqs,121889
41
34
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
42
35
  vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
43
36
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -47,7 +40,7 @@ vision_agent/utils/tools.py,sha256=_XGcF-Zd527BWNkGopBZeJ4j5ehreAkNsKQSBFrlvjw,8
47
40
  vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
48
41
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
49
42
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
50
- vision_agent-1.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
51
- vision_agent-1.0.4.dist-info/METADATA,sha256=-m5UEp_BinJMo7IQqv63frcNGAQ9dUICs8bGnpZD_Ac,5940
52
- vision_agent-1.0.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
53
- vision_agent-1.0.4.dist-info/RECORD,,
43
+ vision_agent-1.0.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-1.0.7.dist-info/METADATA,sha256=aQSPKBzRC6Ai7_kZWAn72h2EsYpDVEHMcDAWCV2H-ho,6746
45
+ vision_agent-1.0.7.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-1.0.7.dist-info/RECORD,,