vision-agent 0.2.153.tar.gz → 0.2.154.tar.gz

Files changed (33)
  1. {vision_agent-0.2.153 → vision_agent-0.2.154}/PKG-INFO +1 -1
  2. {vision_agent-0.2.153 → vision_agent-0.2.154}/pyproject.toml +1 -1
  3. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder_prompts.py +5 -5
  4. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_prompts.py +7 -7
  5. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/__init__.py +2 -1
  6. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/meta_tools.py +2 -2
  7. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools.py +6 -7
  8. {vision_agent-0.2.153 → vision_agent-0.2.154}/LICENSE +0 -0
  9. {vision_agent-0.2.153 → vision_agent-0.2.154}/README.md +0 -0
  10. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/__init__.py +0 -0
  12. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/agent.py +0 -0
  13. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/agent_utils.py +0 -0
  14. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent.py +0 -0
  15. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder.py +0 -0
  16. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/prompts.py +0 -0
  25. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tool_utils.py +0 -0
  26. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools_types.py +0 -0
  27. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/__init__.py +0 -0
  28. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/exceptions.py +0 -0
  29. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/execute.py +0 -0
  30. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/image_utils.py +0 -0
  31. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/video.py +0 -0
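
The substantive change in this release is a rename: the image tool `florence2_phrase_grounding_image` becomes `florence2_phrase_grounding`, while `florence2_phrase_grounding_video` stays exported from `tools/__init__.py` but is dropped from `FUNCTION_TOOLS` (see the hunks below). A minimal migration sketch for callers, assuming the signature `(prompt, image, fine_tune_id=None)` is unchanged apart from the name, as the `tools.py` hunks indicate:

```python
import numpy as np

# 0.2.154 name; in 0.2.153 this was florence2_phrase_grounding_image.
from vision_agent.tools import florence2_phrase_grounding

# Placeholder input so the sketch is self-contained; a real caller would
# load an actual image (e.g. with vision_agent.tools.load_image).
image = np.zeros((480, 640, 3), dtype=np.uint8)

# 0.2.153: detections = florence2_phrase_grounding_image("person", image)
# 0.2.154: same arguments, new name.
detections = florence2_phrase_grounding("person", image)
```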
{vision_agent-0.2.153 → vision_agent-0.2.154}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.153
+Version: 0.2.154
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
{vision_agent-0.2.153 → vision_agent-0.2.154}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "vision-agent"
-version = "0.2.153"
+version = "0.2.154"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder_prompts.py
@@ -101,7 +101,7 @@ plan1:
 - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:

 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)

 # plan2
-florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)

 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)

 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding_image": florence2_out,
+    "florence2_phrase_grounding": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}

 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding_image": florence2_counts,
+    "florence2_phrase_grounding": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}

{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_prompts.py
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
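The artifact excerpt above cuts off at `6| count = 0`. For readers following the example, a plausible continuation, assuming the pairing logic uses the `closest_box_distance` helper imported on artifact line 0 (the loop and threshold are inferred, not shown in this diff):

```python
# Hypothetical continuation of count_workers_with_helmets; inferred from
# the imports on artifact line 0, not taken from this diff.
for worker in workers:
    for helmet in helmets:
        # Treat a helmet whose box nearly touches the worker's box as worn;
        # 0.01 is an illustrative threshold in normalized coordinates.
        if closest_box_distance(worker["bbox"], helmet["bbox"]) < 0.01:
            count += 1
            break
```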
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
     image = load_image(image_path)
--    detections = florence2_phrase_grounding_image("worker, helmet", image)
-+    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
     workers = [d for d in detections if d['label'] == 'worker']
     helmets = [d for d in detections if d['label'] == 'helmet']
     count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3

-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/__init__.py
@@ -24,7 +24,8 @@ from .tools import (
     extract_frames_and_timestamps,
     florence2_image_caption,
     florence2_ocr,
-    florence2_phrase_grounding_image,
+    florence2_phrase_grounding,
+    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/meta_tools.py
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(

     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
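To see what the renamed pattern above does, here is a minimal sketch of the rewrite it performs: it matches an existing `florence2_phrase_grounding` call (with or without a trailing fine-tune id) and re-emits it carrying the new id. The sample code string and id below are illustrative, not taken from a real run:

```python
import re

# Pattern and replacement copied from the hunk above; split only for line
# length. Adjacent string literals concatenate to the original regex.
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
pattern = (
    r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,'
    r'\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
)

code = 'detections = florence2_phrase_grounding("worker, helmet", image)'
new_code = re.sub(
    pattern,
    lambda m: f'florence2_phrase_grounding("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")',
    code,
)
print(new_code)
# detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```

Note that because the regex requires `(` immediately after the name, it does not accidentally match `florence2_phrase_grounding_video(`, which keeps its own entry in the pattern list.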
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools.py
@@ -1143,10 +1143,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore


-def florence2_phrase_grounding_image(
+def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    """'florence2_phrase_grounding' will run florence2 on a image. It can
     detect multiple objects given a text prompt which can be object names or caption.
     You can optionally separate the object names in the text with commas. It returns
     a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1167,7 @@ def florence2_phrase_grounding_image(

     Example
     -------
-        >>> florence2_phrase_grounding_image('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
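The docstring above fixes the tool's return contract under the new name: a list of dicts with normalized `bbox`, `label`, and `score`. A small usage sketch consuming that format, mirroring the helmet example from the prompts file (`workers.jpg` is a hypothetical input path):

```python
from vision_agent.tools import load_image, florence2_phrase_grounding

# Filtering below relies only on the documented return format:
# [{'score': ..., 'label': ..., 'bbox': [x1, y1, x2, y2]}, ...]
image = load_image("workers.jpg")
detections = florence2_phrase_grounding("worker, helmet", image)
workers = [d for d in detections if d["label"] == "worker"]
helmets = [d for d in detections if d["label"] == "helmet"]
print(f"{len(workers)} workers, {len(helmets)} helmets")
```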
@@ -1196,7 +1196,7 @@ def florence2_phrase_grounding_image(
         "florence2-ft",
         v2=True,
         is_form=True,
-        metadata_payload={"function_name": "florence2_phrase_grounding_image"},
+        metadata_payload={"function_name": "florence2_phrase_grounding"},
     )
     # get the first frame
     detection = detections[0]
@@ -1205,7 +1205,7 @@ def florence2_phrase_grounding_image(
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_phrase_grounding_image",
+        "function_name": "florence2_phrase_grounding",
     }
     detections = send_inference_request(data, "florence2", v2=True)
     detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -2164,8 +2164,7 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-    florence2_phrase_grounding_image,
-    florence2_phrase_grounding_video,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,
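
After this hunk, `FUNCTION_TOOLS` registers only the renamed image tool; `florence2_phrase_grounding_video` remains importable (see the `__init__.py` hunk) but is no longer in the registry. A hedged sketch of calling both variants, assuming the video tool takes `(prompt, frames)` as the regex in `meta_tools.py` implies, and that `extract_frames_and_timestamps` returns dicts with a `"frame"` key as the prompt example's comment suggests:

```python
from vision_agent.tools import (
    extract_frames_and_timestamps,
    florence2_phrase_grounding,
    florence2_phrase_grounding_video,
    load_image,
)

# Single image via the renamed tool ("photo.jpg" is a hypothetical path).
image = load_image("photo.jpg")
image_detections = florence2_phrase_grounding("person", image)

# Video via the separate _video tool: sample at 1 FPS, keep 10 frames,
# echoing the plan code in vision_agent_coder_prompts.py.
frames = [f["frame"] for f in extract_frames_and_timestamps("video.mp4", 1)][:10]
video_detections = florence2_phrase_grounding_video("person", frames)
```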