vision-agent 0.2.153__py3-none-any.whl → 0.2.155__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -101,7 +101,7 @@ plan1:
101
101
  - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
102
102
  plan2:
103
103
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
104
- - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
104
+ - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
105
105
  plan3:
106
106
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
107
107
  - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:
109
109
 
110
110
  ```python
111
111
  import numpy as np
112
- from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
112
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
113
113
 
114
114
  # sample at 1 FPS and use the first 10 frames to reduce processing time
115
115
  frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
143
143
  owl_v2_counts = get_counts(owl_v2_out)
144
144
 
145
145
  # plan2
146
- florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
146
+ florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
147
147
  florence2_counts = get_counts(florence2_out)
148
148
 
149
149
  # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
153
153
 
154
154
  final_out = {{
155
155
  "owl_v2_video": owl_v2_out,
156
- "florence2_phrase_grounding_image": florence2_out,
156
+ "florence2_phrase_grounding": florence2_out,
157
157
  "florence2_sam2_video_tracking": f2s2_out,
158
158
  }}
159
159
 
160
160
  counts = {{
161
161
  "owl_v2_video": owl_v2_counts,
162
- "florence2_phrase_grounding_image": florence2_counts,
162
+ "florence2_phrase_grounding": florence2_counts,
163
163
  "florence2_sam2_video_tracking": f2s2_counts,
164
164
  }}
165
165
 
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
131
131
 
132
132
  OBSERVATION:
133
133
  [Artifact code.py]
134
- 0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
134
+ 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
135
135
  1|def count_workers_with_helmets(image_path: str, output_path: str):
136
136
  2| image = load_image(image_path)
137
- 3| detections = florence2_phrase_grounding_image("worker, helmet", image)
137
+ 3| detections = florence2_phrase_grounding("worker, helmet", image)
138
138
  4| workers = [d for d in detections if d['label'] == 'worker']
139
139
  5| helmets = [d for d in detections if d['label'] == 'helmet']
140
140
  6| count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
166
166
  OBSERVATION:
167
167
  [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
168
168
 
169
- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
169
+ AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
170
170
 
171
171
  OBSERVATION:
172
172
  [Artifact code.py edits]
173
173
  ---
174
174
  +++
175
175
  @@ -1,7 +1,7 @@
176
- from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
176
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
177
177
  def count_workers_with_helmets(image_path: str, output_path: str):
178
178
  image = load_image(image_path)
179
- - detections = florence2_phrase_grounding_image("worker, helmet", image)
180
- + detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
179
+ - detections = florence2_phrase_grounding("worker, helmet", image)
180
+ + detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
181
181
  workers = [d for d in detections if d['label'] == 'worker']
182
182
  helmets = [d for d in detections if d['label'] == 'helmet']
183
183
  count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
189
189
  ----- stdout -----
190
190
  3
191
191
 
192
- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
192
+ AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
193
193
  """
@@ -24,7 +24,8 @@ from .tools import (
24
24
  extract_frames_and_timestamps,
25
25
  florence2_image_caption,
26
26
  florence2_ocr,
27
- florence2_phrase_grounding_image,
27
+ florence2_phrase_grounding,
28
+ florence2_phrase_grounding_video,
28
29
  florence2_roberta_vqa,
29
30
  florence2_sam2_image,
30
31
  florence2_sam2_video_tracking,
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(
668
668
 
669
669
  patterns_with_fine_tune_id = [
670
670
  (
671
- r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
672
- lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
671
+ r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
672
+ lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
673
673
  ),
674
674
  (
675
675
  r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
@@ -700,18 +700,22 @@ def countgd_counting(
700
700
  {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
701
701
  ]
702
702
  """
703
- image_b64 = convert_to_b64(image)
703
+ buffer_bytes = numpy_to_bytes(image)
704
+ files = [("image", buffer_bytes)]
704
705
  prompt = prompt.replace(", ", " .")
705
- payload = {"prompt": prompt, "image": image_b64}
706
+ payload = {"prompts": [prompt], "model": "countgd"}
706
707
  metadata = {"function_name": "countgd_counting"}
707
- resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
708
+ resp_data = send_task_inference_request(
709
+ payload, "text-to-object-detection", files=files, metadata=metadata
710
+ )
711
+ bboxes_per_frame = resp_data[0]
708
712
  bboxes_formatted = [
709
713
  ODResponseData(
710
714
  label=bbox["label"],
711
- bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
715
+ bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
712
716
  score=round(bbox["score"], 2),
713
717
  )
714
- for bbox in resp_data
718
+ for bbox in bboxes_per_frame
715
719
  ]
716
720
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
717
721
  return [bbox.model_dump() for bbox in filtered_bboxes]
@@ -1143,10 +1147,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
1143
1147
  return answer[task] # type: ignore
1144
1148
 
1145
1149
 
1146
- def florence2_phrase_grounding_image(
1150
+ def florence2_phrase_grounding(
1147
1151
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
1148
1152
  ) -> List[Dict[str, Any]]:
1149
- """'florence2_phrase_grounding_image' will run florence2 on an image. It can
1153
+ """'florence2_phrase_grounding' will run florence2 on an image. It can
1150
1154
  detect multiple objects given a text prompt which can be object names or caption.
1151
1155
  You can optionally separate the object names in the text with commas. It returns
1152
1156
  a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1171,7 @@ def florence2_phrase_grounding_image(
1167
1171
 
1168
1172
  Example
1169
1173
  -------
1170
- >>> florence2_phrase_grounding_image('person looking at a coyote', image)
1174
+ >>> florence2_phrase_grounding('person looking at a coyote', image)
1171
1175
  [
1172
1176
  {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
1173
1177
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
@@ -1196,7 +1200,7 @@ def florence2_phrase_grounding_image(
1196
1200
  "florence2-ft",
1197
1201
  v2=True,
1198
1202
  is_form=True,
1199
- metadata_payload={"function_name": "florence2_phrase_grounding_image"},
1203
+ metadata_payload={"function_name": "florence2_phrase_grounding"},
1200
1204
  )
1201
1205
  # get the first frame
1202
1206
  detection = detections[0]
@@ -1205,7 +1209,7 @@ def florence2_phrase_grounding_image(
1205
1209
  "image": image_b64,
1206
1210
  "task": "<CAPTION_TO_PHRASE_GROUNDING>",
1207
1211
  "prompt": prompt,
1208
- "function_name": "florence2_phrase_grounding_image",
1212
+ "function_name": "florence2_phrase_grounding",
1209
1213
  }
1210
1214
  detections = send_inference_request(data, "florence2", v2=True)
1211
1215
  detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -2164,8 +2168,7 @@ FUNCTION_TOOLS = [
2164
2168
  florence2_ocr,
2165
2169
  florence2_sam2_image,
2166
2170
  florence2_sam2_video_tracking,
2167
- florence2_phrase_grounding_image,
2168
- florence2_phrase_grounding_video,
2171
+ florence2_phrase_grounding,
2169
2172
  ixc25_image_vqa,
2170
2173
  ixc25_video_vqa,
2171
2174
  detr_segmentation,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.153
3
+ Version: 0.2.155
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
4
4
  vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
5
5
  vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
7
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=Ea_v_qLBJMVwQVLLIdNq15MgV2-6qqhcThHAHFwzv-o,18940
8
- vision_agent/agent/vision_agent_prompts.py,sha256=eOqluRb1R_SJFsdWXd9HJuiJnJccEnDDUkfPXlHOjyw,11293
7
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
8
+ vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
9
9
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
11
11
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
14
14
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
15
15
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
16
16
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
17
- vision_agent/tools/__init__.py,sha256=cg4Axb9L3Z7WkdyEv5IyqDsmZKIrxmS4CmV3DEXURnU,2418
18
- vision_agent/tools/meta_tools.py,sha256=yrplxiDu-L9_Dw_L2ESehJabckAq59Q-xfMpIbYB0Ak,25179
17
+ vision_agent/tools/__init__.py,sha256=u41fm9KGX1s9DWzVAGnuungEooxH4X8fSDk5hjXvDiY,2450
18
+ vision_agent/tools/meta_tools.py,sha256=FN2oMhXzCzSzmk6Na6uKw1r5-CGO3lCk94izcWNFKwA,25167
19
19
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
20
  vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
21
- vision_agent/tools/tools.py,sha256=Of7NTZTc1bim_fdAoDxx47WzttGI8VlMKKcId0sMwfk,78406
21
+ vision_agent/tools/tools.py,sha256=3T5h9dewsqkKu66BlNdBwXnEKNCBl0_FhdHwTNYQolI,78471
22
22
  vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
23
23
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
24
24
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
27
27
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
29
  vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
30
- vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.153.dist-info/METADATA,sha256=zehWh4l1EfZeTKxSEgKXtQMb0EE5pvWP1UG0d2lyS44,13758
32
- vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.153.dist-info/RECORD,,
30
+ vision_agent-0.2.155.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.155.dist-info/METADATA,sha256=lueDmQRoKz_BUNDRApWHxege_xxXnPI117OBh1nZJcg,13758
32
+ vision_agent-0.2.155.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.155.dist-info/RECORD,,