vision-agent 0.2.153__py3-none-any.whl → 0.2.155__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,7 +101,7 @@ plan1:
  - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
  plan2:
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
- - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
+ - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
  plan3:
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
  - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:
  
  ```python
  import numpy as np
- from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
  
  # sample at 1 FPS and use the first 10 frames to reduce processing time
  frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
  owl_v2_counts = get_counts(owl_v2_out)
  
  # plan2
- florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
+ florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
  florence2_counts = get_counts(florence2_out)
  
  # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
  
  final_out = {{
      "owl_v2_video": owl_v2_out,
-     "florence2_phrase_grounding_image": florence2_out,
+     "florence2_phrase_grounding": florence2_out,
      "florence2_sam2_video_tracking": f2s2_out,
  }}
  
  counts = {{
      "owl_v2_video": owl_v2_counts,
-     "florence2_phrase_grounding_image": florence2_counts,
+     "florence2_phrase_grounding": florence2_counts,
      "florence2_sam2_video_tracking": f2s2_counts,
  }}
  
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
  
  OBSERVATION:
  [Artifact code.py]
- 0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+ 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
  1|def count_workers_with_helmets(image_path: str, output_path: str):
  2|    image = load_image(image_path)
- 3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
+ 3|    detections = florence2_phrase_grounding("worker, helmet", image)
  4|    workers = [d for d in detections if d['label'] == 'worker']
  5|    helmets = [d for d in detections if d['label'] == 'helmet']
  6|    count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
  OBSERVATION:
  [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
  
- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
  
  OBSERVATION:
  [Artifact code.py edits]
  ---
  +++
  @@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
  def count_workers_with_helmets(image_path: str, output_path: str):
      image = load_image(image_path)
- -    detections = florence2_phrase_grounding_image("worker, helmet", image)
- +    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+ -    detections = florence2_phrase_grounding("worker, helmet", image)
+ +    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
  workers = [d for d in detections if d['label'] == 'worker']
  helmets = [d for d in detections if d['label'] == 'helmet']
  count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
  ----- stdout -----
  3
  
- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+ AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
  """
@@ -24,7 +24,8 @@ from .tools import (
      extract_frames_and_timestamps,
      florence2_image_caption,
      florence2_ocr,
-     florence2_phrase_grounding_image,
+     florence2_phrase_grounding,
+     florence2_phrase_grounding_video,
      florence2_roberta_vqa,
      florence2_sam2_image,
      florence2_sam2_video_tracking,
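
For code that imported the old name, this hunk means the import list changes as sketched below. A minimal, hypothetical migration snippet; only the two phrase-grounding names come from this diff:

```python
# Before (0.2.153):
# from vision_agent.tools import florence2_phrase_grounding_image

# After (0.2.155), per the __init__.py hunk above:
from vision_agent.tools import (
    florence2_phrase_grounding,        # renamed from florence2_phrase_grounding_image
    florence2_phrase_grounding_video,  # now exported alongside it
)
```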
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(
  
      patterns_with_fine_tune_id = [
          (
-             r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-             lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+             r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+             lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
          ),
          (
              r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
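
To illustrate what the updated pattern does, here is a minimal sketch that applies the new regex/lambda pair from this hunk with Python's re module. The source line being rewritten and the fine-tune id are illustrative only:

```python
import re

# Pattern and replacement copied from the updated meta_tools.py hunk above.
pattern = r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"

code = 'detections = florence2_phrase_grounding("worker, helmet", image)'
rewritten = re.sub(
    pattern,
    lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
    code,
)
print(rewritten)
# detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```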
@@ -700,18 +700,22 @@ def countgd_counting(
          {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
      ]
      """
-     image_b64 = convert_to_b64(image)
+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
      prompt = prompt.replace(", ", " .")
-     payload = {"prompt": prompt, "image": image_b64}
+     payload = {"prompts": [prompt], "model": "countgd"}
      metadata = {"function_name": "countgd_counting"}
-     resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
+     resp_data = send_task_inference_request(
+         payload, "text-to-object-detection", files=files, metadata=metadata
+     )
+     bboxes_per_frame = resp_data[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
-             bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
+             bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
              score=round(bbox["score"], 2),
          )
-         for bbox in resp_data
+         for bbox in bboxes_per_frame
      ]
      filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
      return [bbox.model_dump() for bbox in filtered_bboxes]
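
The rewritten countgd_counting implies a new response shape from the 'text-to-object-detection' endpoint: a list of frames, each holding detections keyed by "bounding_box", rather than the previous flat list keyed by "bbox". A minimal sketch of the post-processing, with a hand-written response standing in for a real server reply and plain dicts standing in for the library's ODResponseData model:

```python
# Illustrative response in the shape the updated code indexes with resp_data[0].
resp_data = [
    [
        {"label": "flower", "score": 0.985, "bounding_box": [0.44, 0.24, 0.49, 0.58]},
        {"label": "flower", "score": 0.912, "bounding_box": [0.12, 0.30, 0.19, 0.55]},
    ]
]

bboxes_per_frame = resp_data[0]  # a single image yields one frame of detections
bboxes_formatted = [
    {
        "label": det["label"],
        "bbox": [round(x, 2) for x in det["bounding_box"]],
        "score": round(det["score"], 2),
    }
    for det in bboxes_per_frame
]
print(bboxes_formatted)
```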
@@ -1143,10 +1147,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
      return answer[task] # type: ignore
  
  
- def florence2_phrase_grounding_image(
+ def florence2_phrase_grounding(
      prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
  ) -> List[Dict[str, Any]]:
-     """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+     """'florence2_phrase_grounding' will run florence2 on a image. It can
      detect multiple objects given a text prompt which can be object names or caption.
      You can optionally separate the object names in the text with commas. It returns
      a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1171,7 @@ def florence2_phrase_grounding_image(
  
      Example
      -------
-     >>> florence2_phrase_grounding_image('person looking at a coyote', image)
+     >>> florence2_phrase_grounding('person looking at a coyote', image)
      [
          {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
          {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
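
Taking the renamed signature at face value (prompt, image, optional fine_tune_id), a hedged usage sketch follows. It requires vision-agent 0.2.155 with API access, and the black placeholder image will likely return no detections:

```python
import numpy as np

from vision_agent.tools import florence2_phrase_grounding

# Placeholder frame; a real photo would come from e.g. load_image(...).
image = np.zeros((480, 640, 3), dtype=np.uint8)

detections = florence2_phrase_grounding("worker, helmet", image)
# The optional third argument is a fine-tuned model id, per the new signature:
# detections = florence2_phrase_grounding(
#     "worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf"
# )
for det in detections:
    print(det["label"], det["score"], det["bbox"])  # normalized [x1, y1, x2, y2]
```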
@@ -1196,7 +1200,7 @@ def florence2_phrase_grounding_image(
              "florence2-ft",
              v2=True,
              is_form=True,
-             metadata_payload={"function_name": "florence2_phrase_grounding_image"},
+             metadata_payload={"function_name": "florence2_phrase_grounding"},
          )
          # get the first frame
          detection = detections[0]
@@ -1205,7 +1209,7 @@ def florence2_phrase_grounding_image(
          "image": image_b64,
          "task": "<CAPTION_TO_PHRASE_GROUNDING>",
          "prompt": prompt,
-         "function_name": "florence2_phrase_grounding_image",
+         "function_name": "florence2_phrase_grounding",
      }
      detections = send_inference_request(data, "florence2", v2=True)
      detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -2164,8 +2168,7 @@ FUNCTION_TOOLS = [
      florence2_ocr,
      florence2_sam2_image,
      florence2_sam2_video_tracking,
-     florence2_phrase_grounding_image,
-     florence2_phrase_grounding_video,
+     florence2_phrase_grounding,
      ixc25_image_vqa,
      ixc25_video_vqa,
      detr_segmentation,
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.153
+ Version: 0.2.155
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
  vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
  vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
  vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=Ea_v_qLBJMVwQVLLIdNq15MgV2-6qqhcThHAHFwzv-o,18940
- vision_agent/agent/vision_agent_prompts.py,sha256=eOqluRb1R_SJFsdWXd9HJuiJnJccEnDDUkfPXlHOjyw,11293
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
+ vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
- vision_agent/tools/__init__.py,sha256=cg4Axb9L3Z7WkdyEv5IyqDsmZKIrxmS4CmV3DEXURnU,2418
- vision_agent/tools/meta_tools.py,sha256=yrplxiDu-L9_Dw_L2ESehJabckAq59Q-xfMpIbYB0Ak,25179
+ vision_agent/tools/__init__.py,sha256=u41fm9KGX1s9DWzVAGnuungEooxH4X8fSDk5hjXvDiY,2450
+ vision_agent/tools/meta_tools.py,sha256=FN2oMhXzCzSzmk6Na6uKw1r5-CGO3lCk94izcWNFKwA,25167
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
- vision_agent/tools/tools.py,sha256=Of7NTZTc1bim_fdAoDxx47WzttGI8VlMKKcId0sMwfk,78406
+ vision_agent/tools/tools.py,sha256=3T5h9dewsqkKu66BlNdBwXnEKNCBl0_FhdHwTNYQolI,78471
  vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
- vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.153.dist-info/METADATA,sha256=zehWh4l1EfZeTKxSEgKXtQMb0EE5pvWP1UG0d2lyS44,13758
- vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.153.dist-info/RECORD,,
+ vision_agent-0.2.155.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.155.dist-info/METADATA,sha256=lueDmQRoKz_BUNDRApWHxege_xxXnPI117OBh1nZJcg,13758
+ vision_agent-0.2.155.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.155.dist-info/RECORD,,