vision-agent 0.2.180__py3-none-any.whl → 0.2.182__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
vision_agent/agent/vision_agent.py

@@ -492,29 +492,8 @@ class VisionAgent(Agent):
      code_interpreter.upload_file(artifacts.local_save_path)

      response = run_conversation(self.agent, int_chat)
-     code_action = use_extra_vision_agent_args(
-         response.get("execute_python", None),
-         test_multi_plan,
-         custom_tool_names,
-     )
      if self.verbosity >= 1:
          _LOGGER.info(response)
-     int_chat.append(
-         {
-             "role": "assistant",
-             "content": json.dumps(
-                 new_format_to_old_format(add_step_descriptions(response))
-             ),
-         }
-     )
-     orig_chat.append(
-         {
-             "role": "assistant",
-             "content": json.dumps(
-                 new_format_to_old_format(add_step_descriptions(response))
-             ),
-         }
-     )

      code_action = response.get("execute_python", None)
      # sometimes it gets stuck in a loop, so we force it to exit
@@ -529,7 +508,7 @@ class VisionAgent(Agent):
              "value": "Agent is stuck in conversation loop, exited",
              "traceback_raw": [],
          },
-         "finished": code_action is None,
+         "finished": True,
      }
  )
  else:
@@ -544,6 +523,22 @@ class VisionAgent(Agent):
          }
      )

+     int_chat.append(
+         {
+             "role": "assistant",
+             "content": json.dumps(
+                 new_format_to_old_format(add_step_descriptions(response))
+             ),
+         }
+     )
+     orig_chat.append(
+         {
+             "role": "assistant",
+             "content": json.dumps(
+                 new_format_to_old_format(add_step_descriptions(response))
+             ),
+         }
+     )
      finished = response.get("let_user_respond", False)

      if code_action is not None:
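
Net effect of the three vision_agent.py hunks above: the `use_extra_vision_agent_args` wrapper is gone, `code_action` is read directly off the response, a forced loop exit is now always reported as finished, and the assistant turn is appended to both chat histories only after the loop check. A minimal sketch of the new ordering, assuming the package's own helpers are passed in (the observation schema here is abbreviated; this is not the verbatim implementation):

    import json

    def handle_turn(agent, int_chat, orig_chat, run_conversation, to_old_format, is_stuck):
        # Sketch of the 0.2.182 turn ordering in the VisionAgent conversation loop.
        response = run_conversation(agent, int_chat)
        code_action = response.get("execute_python", None)

        if is_stuck(response):
            # The forced exit is now always marked finished, regardless of code_action.
            int_chat.append({"role": "observation",
                             "content": "stuck in loop, exited",
                             "finished": True})

        # The assistant message is recorded after the loop check, in both histories.
        message = {"role": "assistant", "content": json.dumps(to_old_format(response))}
        int_chat.append(message)
        orig_chat.append(message)

        return code_action, response.get("let_user_respond", False)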
vision_agent/tools/tools.py

@@ -1,4 +1,3 @@
- import base64
  import io
  import json
  import logging
@@ -184,8 +183,16 @@ def owl_v2_image(
      if image_size[0] < 1 or image_size[1] < 1:
          return []

+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "confidence": box_threshold,
+         "model": "owlv2",
+     }
+     metadata = {"function_name": "owl_v2_image"}
+
      if fine_tune_id is not None:
-         image_b64 = convert_to_b64(image)
          landing_api = LandingPublicAPI()
          status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
          if status is not JobStatus.SUCCEEDED:
@@ -193,43 +200,22 @@ def owl_v2_image(
              f"Fine-tuned model {fine_tune_id} is not ready yet"
          )

-         data_obj = Florence2FtRequest(
-             image=image_b64,
-             task=PromptTask.PHRASE_GROUNDING,
-             prompt=prompt,
-             job_id=UUID(fine_tune_id),
-         )
-         data = data_obj.model_dump(by_alias=True, exclude_none=True)
-         detections = send_inference_request(
-             data,
-             "florence2-ft",
-             v2=True,
-             is_form=True,
-             metadata_payload={"function_name": "owl_v2_image"},
-         )
-         # get the first frame
-         detection = detections[0]
-         bboxes_formatted = [
-             ODResponseData(
-                 label=detection["labels"][i],
-                 bbox=normalize_bbox(detection["bboxes"][i], image_size),
-                 score=1.0,
-             )
-             for i in range(len(detection["bboxes"]))
-         ]
-         return [bbox.model_dump() for bbox in bboxes_formatted]
+         # we can only execute fine-tuned models with florence2
+         payload = {
+             "prompts": payload["prompts"],
+             "jobId": fine_tune_id,
+             "model": "florence2",
+         }

-     buffer_bytes = numpy_to_bytes(image)
-     files = [("image", buffer_bytes)]
-     payload = {
-         "prompts": [s.strip() for s in prompt.split(",")],
-         "model": "owlv2",
-         "function_name": "owl_v2_image",
-     }
-     resp_data = send_inference_request(
-         payload, "text-to-object-detection", files=files, v2=True
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
      )
-     bboxes = resp_data[0]
+
+     # get the first frame
+     bboxes = detections[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
@@ -238,17 +224,17 @@ def owl_v2_image(
          )
          for bbox in bboxes
      ]
-     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
-     return [bbox.model_dump() for bbox in filtered_bboxes]
+     return [bbox.model_dump() for bbox in bboxes_formatted]


  def owl_v2_video(
      prompt: str,
      frames: List[np.ndarray],
      box_threshold: float = 0.10,
+     fine_tune_id: Optional[str] = None,
  ) -> List[List[Dict[str, Any]]]:
      """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-     objects indepdently per frame given a text prompt such as a category name or
+     objects independently per frame given a text prompt such as a category name or
      referring expression but does not track objects across frames. The categories in
      text prompt are separated by commas. It returns a list of lists where each inner
      list contains the score, label, and bounding box of the detections for that frame.
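
With `send_task_inference_request`, `owl_v2_image` now sends `box_threshold` to the API as a `confidence` field, so the client-side `filter_bboxes_by_threshold` pass is dropped; with a `fine_tune_id` the request is rerouted to the florence2 job instead of the separate florence2-ft endpoint. A hypothetical call (placeholder image and job id):

    import numpy as np
    from vision_agent.tools import owl_v2_image

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

    # Plain OWLv2 detection; thresholding now happens server-side via "confidence".
    detections = owl_v2_image("person, car", image, box_threshold=0.10)

    # Fine-tuned path: the payload swaps to model="florence2" with the job id.
    # detections = owl_v2_image("person", image, fine_tune_id="<your-job-uuid>")

    for det in detections:
        print(det["label"], det["score"], det["bbox"])  # bbox is normalized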
@@ -258,6 +244,8 @@ def owl_v2_video(
          frames (List[np.ndarray]): The list of frames to ground the prompt to.
          box_threshold (float, optional): The threshold for the box detection. Defaults
              to 0.30.
+         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+             fine-tuned model ID here to use it.

      Returns:
          List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
@@ -285,30 +273,45 @@ def owl_v2_video(
      files = [("video", buffer_bytes)]
      payload = {
          "prompts": [s.strip() for s in prompt.split(",")],
+         "confidence": box_threshold,
          "model": "owlv2",
-         "function_name": "owl_v2_video",
      }
-     data: Dict[str, Any] = send_inference_request(
-         payload, "text-to-object-detection", files=files, v2=True
+     metadata = {"function_name": "owl_v2_video"}
+
+     if fine_tune_id is not None:
+         landing_api = LandingPublicAPI()
+         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+         if status is not JobStatus.SUCCEEDED:
+             raise FineTuneModelIsNotReady(
+                 f"Fine-tuned model {fine_tune_id} is not ready yet"
+             )
+
+         # we can only execute fine-tuned models with florence2
+         payload = {
+             "prompts": payload["prompts"],
+             "jobId": fine_tune_id,
+             "model": "florence2",
+         }
+
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
      )
-     bboxes_formatted = []
-     if data is not None:
-         for frame_data in data:
-             bboxes_formated_frame = []
-             for elt in frame_data:
-                 bboxes_formated_frame.append(
-                     ODResponseData(
-                         label=elt["label"],  # type: ignore
-                         bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
-                         score=round(elt["score"], 2),  # type: ignore
-                     )
-                 )
-             bboxes_formatted.append(bboxes_formated_frame)

-     filtered_bboxes = [
-         filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
-     ]
-     return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
+     bboxes_formatted = []
+     for frame_data in detections:
+         bboxes_formatted_per_frame = [
+             ODResponseData(
+                 label=bbox["label"],
+                 bbox=normalize_bbox(bbox["bounding_box"], image_size),
+                 score=round(bbox["score"], 2),
+             )
+             for bbox in frame_data
+         ]
+         bboxes_formatted.append(bboxes_formatted_per_frame)
+     return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]


  def grounding_sam(
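
`owl_v2_video` picks up the same `fine_tune_id` path as the image variant: the job is validated through `LandingPublicAPI` and the payload is swapped over to florence2 before a single `send_task_inference_request` call. A hedged usage sketch with dummy frames:

    import numpy as np
    from vision_agent.tools import owl_v2_video

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]  # dummy clip

    # Returns one inner list of detections per frame; no tracking across frames.
    per_frame = owl_v2_video("person, dog", frames, box_threshold=0.10)
    for i, dets in enumerate(per_frame):
        print(f"frame {i}: {len(dets)} detections")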
@@ -708,23 +711,31 @@ def countgd_counting(
      image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
+
      buffer_bytes = numpy_to_bytes(image)
      files = [("image", buffer_bytes)]
-     prompt = prompt.replace(", ", " .")
-     payload = {"prompts": [prompt], "model": "countgd"}
+     payload = {
+         "prompts": [prompt.replace(", ", " .")],
+         "confidence": box_threshold,  # still not being used in the API
+         "model": "countgd",
+     }
      metadata = {"function_name": "countgd_counting"}
-     resp_data = send_task_inference_request(
+
+     detections = send_task_inference_request(
          payload, "text-to-object-detection", files=files, metadata=metadata
      )
-     bboxes_per_frame = resp_data[0]
+
+     # get the first frame
+     bboxes = detections[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
              bbox=normalize_bbox(bbox["bounding_box"], image_size),
              score=round(bbox["score"], 2),
          )
-         for bbox in bboxes_per_frame
+         for bbox in bboxes
      ]
+     # TODO: remove this once we start to use the confidence on countgd
      filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
      return [bbox.model_dump() for bbox in filtered_bboxes]

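`countgd_counting` sends the threshold as `confidence` too, but as the in-diff comments note the API does not use it yet, so the client-side `filter_bboxes_by_threshold` call stays in place for now. The calling convention is unchanged; a hypothetical example:

    import numpy as np
    from vision_agent.tools import countgd_counting

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    dets = countgd_counting("flower", image)
    print(f"counted {len(dets)} objects")
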
@@ -768,6 +779,7 @@ def countgd_example_based_counting(
      image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
+
      buffer_bytes = numpy_to_bytes(image)
      files = [("image", buffer_bytes)]
      visual_prompts = [
@@ -775,10 +787,13 @@ def countgd_example_based_counting(
      ]
      payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
      metadata = {"function_name": "countgd_example_based_counting"}
-     resp_data = send_task_inference_request(
+
+     detections = send_task_inference_request(
          payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
      )
-     bboxes_per_frame = resp_data[0]
+
+     # get the first frame
+     bboxes_per_frame = detections[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
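
The example-based variant gets the same cosmetic rename (`resp_data` to `detections`) plus the first-frame comment; behavior is unchanged. It counts by visual example rather than text, so it takes example boxes. A sketch assuming normalized `[x1, y1, x2, y2]` example boxes (the coordinate convention is the function's own, so check its docstring):

    import numpy as np
    from vision_agent.tools import countgd_example_based_counting

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    examples = [[0.1, 0.1, 0.4, 0.42]]  # illustrative example box
    dets = countgd_example_based_counting(visual_prompts=examples, image=image)
    print(f"found {len(dets)} matches")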
@@ -1240,7 +1255,14 @@ def florence2_phrase_grounding(
      image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
-     image_b64 = convert_to_b64(image)
+
+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "model": "florence2",
+     }
+     metadata = {"function_name": "florence2_phrase_grounding"}

      if fine_tune_id is not None:
          landing_api = LandingPublicAPI()
@@ -1250,42 +1272,27 @@ def florence2_phrase_grounding(
              f"Fine-tuned model {fine_tune_id} is not ready yet"
          )

-         data_obj = Florence2FtRequest(
-             image=image_b64,
-             task=PromptTask.PHRASE_GROUNDING,
-             prompt=prompt,
-             job_id=UUID(fine_tune_id),
-         )
-         data = data_obj.model_dump(by_alias=True, exclude_none=True)
-         detections = send_inference_request(
-             data,
-             "florence2-ft",
-             v2=True,
-             is_form=True,
-             metadata_payload={"function_name": "florence2_phrase_grounding"},
-         )
-         # get the first frame
-         detection = detections[0]
-     else:
-         data = {
-             "image": image_b64,
-             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-             "prompt": prompt,
-             "function_name": "florence2_phrase_grounding",
-         }
-         detections = send_inference_request(data, "florence2", v2=True)
-         detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+         payload["jobId"] = fine_tune_id

-     return_data = []
-     for i in range(len(detection["bboxes"])):
-         return_data.append(
-             ODResponseData(
-                 label=detection["labels"][i],
-                 bbox=normalize_bbox(detection["bboxes"][i], image_size),
-                 score=1.0,
-             )
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
+     )
+
+     # get the first frame
+     bboxes = detections[0]
+     bboxes_formatted = [
+         ODResponseData(
+             label=bbox["label"],
+             bbox=normalize_bbox(bbox["bounding_box"], image_size),
+             score=round(bbox["score"], 2),
          )
-     return [bbox.model_dump() for bbox in return_data]
+         for bbox in bboxes
+     ]
+
+     return [bbox.model_dump() for bbox in bboxes_formatted]


  def florence2_phrase_grounding_video(
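
`florence2_phrase_grounding` now goes through the shared `text-to-object-detection` task endpoint, so it returns the API's actual confidence scores (rounded to two decimals) instead of a hardcoded 1.0, and the fine-tuned branch collapses to a single `payload["jobId"]` assignment. A hypothetical call:

    import numpy as np
    from vision_agent.tools import florence2_phrase_grounding

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    dets = florence2_phrase_grounding("person, car", image)
    # dets = florence2_phrase_grounding("person", image, fine_tune_id="<your-job-uuid>")
    for det in dets:
        print(det["label"], det["score"], det["bbox"])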
@@ -1327,6 +1334,11 @@ def florence2_phrase_grounding_video(
      image_size = frames[0].shape[:2]
      buffer_bytes = frames_to_bytes(frames)
      files = [("video", buffer_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "model": "florence2",
+     }
+     metadata = {"function_name": "florence2_phrase_grounding_video"}

      if fine_tune_id is not None:
          landing_api = LandingPublicAPI()
@@ -1336,41 +1348,25 @@ def florence2_phrase_grounding_video(
              f"Fine-tuned model {fine_tune_id} is not ready yet"
          )

-         data_obj = Florence2FtRequest(
-             task=PromptTask.PHRASE_GROUNDING,
-             prompt=prompt,
-             job_id=UUID(fine_tune_id),
-         )
+         payload["jobId"] = fine_tune_id

-         data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
-         detections = send_inference_request(
-             data,
-             "florence2-ft",
-             v2=True,
-             files=files,
-             metadata_payload={"function_name": "florence2_phrase_grounding_video"},
-         )
-     else:
-         data = {
-             "prompt": prompt,
-             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-             "function_name": "florence2_phrase_grounding_video",
-             "video": base64.b64encode(buffer_bytes).decode("utf-8"),
-         }
-         detections = send_inference_request(data, "florence2", v2=True)
-         detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
+     )

      bboxes_formatted = []
      for frame_data in detections:
-         bboxes_formatted_per_frame = []
-         for idx in range(len(frame_data["bboxes"])):
-             bboxes_formatted_per_frame.append(
-                 ODResponseData(
-                     label=frame_data["labels"][idx],
-                     bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
-                     score=1.0,
-                 )
+         bboxes_formatted_per_frame = [
+             ODResponseData(
+                 label=bbox["label"],
+                 bbox=normalize_bbox(bbox["bounding_box"], image_size),
+                 score=round(bbox["score"], 2),
              )
+             for bbox in frame_data
+         ]
          bboxes_formatted.append(bboxes_formatted_per_frame)
      return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]

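The video variant makes the same switch, which also removes the module's last use of `base64` and explains the deleted import at the top of tools.py. A usage sketch with placeholder frames:

    import numpy as np
    from vision_agent.tools import florence2_phrase_grounding_video

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(4)]  # dummy clip
    per_frame = florence2_phrase_grounding_video("person", frames)
    # per_frame = florence2_phrase_grounding_video("person", frames, fine_tune_id="<your-job-uuid>")
    print([len(dets) for dets in per_frame])
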
vision_agent-0.2.180.dist-info/METADATA → vision_agent-0.2.182.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.180
+ Version: 0.2.182
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
vision_agent-0.2.180.dist-info/RECORD → vision_agent-0.2.182.dist-info/RECORD

@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
  vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
- vision_agent/agent/vision_agent.py,sha256=x0-TElnTRW7abyq2wAwKRiTUExBGg24C-c74wO1oKtI,26336
+ vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
  vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
  vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdV
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
- vision_agent/tools/tools.py,sha256=9MbX3b_xff-cHeCh46_q6gt7b5jNSCVSwiu2rwM43Ws,81224
+ vision_agent/tools/tools.py,sha256=p0MBQnwA10NF48ZhTIRWzHaarkezjvDazk7VuvjH1-k,80142
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
- vision_agent-0.2.180.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.180.dist-info/METADATA,sha256=KHeuZn1H6KJXyMlkPyrmie_AqUL1MMALOIoU0kKzg2s,18330
- vision_agent-0.2.180.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.180.dist-info/RECORD,,
+ vision_agent-0.2.182.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.182.dist-info/METADATA,sha256=eLwHRDYfkonJsLN0ug1Sc2bqZv7SAHiDzVeYeTGCmj8,18330
+ vision_agent-0.2.182.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.182.dist-info/RECORD,,