vision-agent 0.2.180__tar.gz → 0.2.182__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. {vision_agent-0.2.180 → vision_agent-0.2.182}/PKG-INFO +1 -1
  2. {vision_agent-0.2.180 → vision_agent-0.2.182}/pyproject.toml +1 -1
  3. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/vision_agent.py +17 -22
  4. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/tools/tools.py +128 -132
  5. {vision_agent-0.2.180 → vision_agent-0.2.182}/LICENSE +0 -0
  6. {vision_agent-0.2.180 → vision_agent-0.2.182}/README.md +0 -0
  7. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/__init__.py +0 -0
  8. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/__init__.py +0 -0
  9. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/agent.py +0 -0
  10. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/agent_utils.py +0 -0
  11. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_coder.py +0 -0
  12. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  13. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_planner.py +0 -0
  14. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  15. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_prompts.py +0 -0
  16. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/tools/__init__.py +0 -0
  25. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/tools/meta_tools.py +0 -0
  26. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/tools/tool_utils.py +0 -0
  28. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/execute.py +0 -0
  32. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/image_utils.py +0 -0
  33. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/sim.py +0 -0
  34. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/type_defs.py +0 -0
  35. {vision_agent-0.2.180 → vision_agent-0.2.182}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.180
3
+ Version: 0.2.182
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.180"
7
+ version = "0.2.182"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -492,29 +492,8 @@ class VisionAgent(Agent):
492
492
  code_interpreter.upload_file(artifacts.local_save_path)
493
493
 
494
494
  response = run_conversation(self.agent, int_chat)
495
- code_action = use_extra_vision_agent_args(
496
- response.get("execute_python", None),
497
- test_multi_plan,
498
- custom_tool_names,
499
- )
500
495
  if self.verbosity >= 1:
501
496
  _LOGGER.info(response)
502
- int_chat.append(
503
- {
504
- "role": "assistant",
505
- "content": json.dumps(
506
- new_format_to_old_format(add_step_descriptions(response))
507
- ),
508
- }
509
- )
510
- orig_chat.append(
511
- {
512
- "role": "assistant",
513
- "content": json.dumps(
514
- new_format_to_old_format(add_step_descriptions(response))
515
- ),
516
- }
517
- )
518
497
 
519
498
  code_action = response.get("execute_python", None)
520
499
  # sometimes it gets stuck in a loop, so we force it to exit
@@ -529,7 +508,7 @@ class VisionAgent(Agent):
529
508
  "value": "Agent is stuck in conversation loop, exited",
530
509
  "traceback_raw": [],
531
510
  },
532
- "finished": code_action is None,
511
+ "finished": True,
533
512
  }
534
513
  )
535
514
  else:
@@ -544,6 +523,22 @@ class VisionAgent(Agent):
544
523
  }
545
524
  )
546
525
 
526
+ int_chat.append(
527
+ {
528
+ "role": "assistant",
529
+ "content": json.dumps(
530
+ new_format_to_old_format(add_step_descriptions(response))
531
+ ),
532
+ }
533
+ )
534
+ orig_chat.append(
535
+ {
536
+ "role": "assistant",
537
+ "content": json.dumps(
538
+ new_format_to_old_format(add_step_descriptions(response))
539
+ ),
540
+ }
541
+ )
547
542
  finished = response.get("let_user_respond", False)
548
543
 
549
544
  if code_action is not None:
@@ -1,4 +1,3 @@
1
- import base64
2
1
  import io
3
2
  import json
4
3
  import logging
@@ -184,8 +183,16 @@ def owl_v2_image(
184
183
  if image_size[0] < 1 or image_size[1] < 1:
185
184
  return []
186
185
 
186
+ buffer_bytes = numpy_to_bytes(image)
187
+ files = [("image", buffer_bytes)]
188
+ payload = {
189
+ "prompts": [s.strip() for s in prompt.split(",")],
190
+ "confidence": box_threshold,
191
+ "model": "owlv2",
192
+ }
193
+ metadata = {"function_name": "owl_v2_image"}
194
+
187
195
  if fine_tune_id is not None:
188
- image_b64 = convert_to_b64(image)
189
196
  landing_api = LandingPublicAPI()
190
197
  status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
191
198
  if status is not JobStatus.SUCCEEDED:
@@ -193,43 +200,22 @@ def owl_v2_image(
193
200
  f"Fine-tuned model {fine_tune_id} is not ready yet"
194
201
  )
195
202
 
196
- data_obj = Florence2FtRequest(
197
- image=image_b64,
198
- task=PromptTask.PHRASE_GROUNDING,
199
- prompt=prompt,
200
- job_id=UUID(fine_tune_id),
201
- )
202
- data = data_obj.model_dump(by_alias=True, exclude_none=True)
203
- detections = send_inference_request(
204
- data,
205
- "florence2-ft",
206
- v2=True,
207
- is_form=True,
208
- metadata_payload={"function_name": "owl_v2_image"},
209
- )
210
- # get the first frame
211
- detection = detections[0]
212
- bboxes_formatted = [
213
- ODResponseData(
214
- label=detection["labels"][i],
215
- bbox=normalize_bbox(detection["bboxes"][i], image_size),
216
- score=1.0,
217
- )
218
- for i in range(len(detection["bboxes"]))
219
- ]
220
- return [bbox.model_dump() for bbox in bboxes_formatted]
203
+ # we can only execute fine-tuned models with florence2
204
+ payload = {
205
+ "prompts": payload["prompts"],
206
+ "jobId": fine_tune_id,
207
+ "model": "florence2",
208
+ }
221
209
 
222
- buffer_bytes = numpy_to_bytes(image)
223
- files = [("image", buffer_bytes)]
224
- payload = {
225
- "prompts": [s.strip() for s in prompt.split(",")],
226
- "model": "owlv2",
227
- "function_name": "owl_v2_image",
228
- }
229
- resp_data = send_inference_request(
230
- payload, "text-to-object-detection", files=files, v2=True
210
+ detections = send_task_inference_request(
211
+ payload,
212
+ "text-to-object-detection",
213
+ files=files,
214
+ metadata=metadata,
231
215
  )
232
- bboxes = resp_data[0]
216
+
217
+ # get the first frame
218
+ bboxes = detections[0]
233
219
  bboxes_formatted = [
234
220
  ODResponseData(
235
221
  label=bbox["label"],
@@ -238,17 +224,17 @@ def owl_v2_image(
238
224
  )
239
225
  for bbox in bboxes
240
226
  ]
241
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
242
- return [bbox.model_dump() for bbox in filtered_bboxes]
227
+ return [bbox.model_dump() for bbox in bboxes_formatted]
243
228
 
244
229
 
245
230
  def owl_v2_video(
246
231
  prompt: str,
247
232
  frames: List[np.ndarray],
248
233
  box_threshold: float = 0.10,
234
+ fine_tune_id: Optional[str] = None,
249
235
  ) -> List[List[Dict[str, Any]]]:
250
236
  """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
251
- objects indepdently per frame given a text prompt such as a category name or
237
+ objects independently per frame given a text prompt such as a category name or
252
238
  referring expression but does not track objects across frames. The categories in
253
239
  text prompt are separated by commas. It returns a list of lists where each inner
254
240
  list contains the score, label, and bounding box of the detections for that frame.
@@ -258,6 +244,8 @@ def owl_v2_video(
258
244
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
259
245
  box_threshold (float, optional): The threshold for the box detection. Defaults
260
246
  to 0.10.
247
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
248
+ fine-tuned model ID here to use it.
261
249
 
262
250
  Returns:
263
251
  List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
@@ -285,30 +273,45 @@ def owl_v2_video(
285
273
  files = [("video", buffer_bytes)]
286
274
  payload = {
287
275
  "prompts": [s.strip() for s in prompt.split(",")],
276
+ "confidence": box_threshold,
288
277
  "model": "owlv2",
289
- "function_name": "owl_v2_video",
290
278
  }
291
- data: Dict[str, Any] = send_inference_request(
292
- payload, "text-to-object-detection", files=files, v2=True
279
+ metadata = {"function_name": "owl_v2_video"}
280
+
281
+ if fine_tune_id is not None:
282
+ landing_api = LandingPublicAPI()
283
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
284
+ if status is not JobStatus.SUCCEEDED:
285
+ raise FineTuneModelIsNotReady(
286
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
287
+ )
288
+
289
+ # we can only execute fine-tuned models with florence2
290
+ payload = {
291
+ "prompts": payload["prompts"],
292
+ "jobId": fine_tune_id,
293
+ "model": "florence2",
294
+ }
295
+
296
+ detections = send_task_inference_request(
297
+ payload,
298
+ "text-to-object-detection",
299
+ files=files,
300
+ metadata=metadata,
293
301
  )
294
- bboxes_formatted = []
295
- if data is not None:
296
- for frame_data in data:
297
- bboxes_formated_frame = []
298
- for elt in frame_data:
299
- bboxes_formated_frame.append(
300
- ODResponseData(
301
- label=elt["label"], # type: ignore
302
- bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
303
- score=round(elt["score"], 2), # type: ignore
304
- )
305
- )
306
- bboxes_formatted.append(bboxes_formated_frame)
307
302
 
308
- filtered_bboxes = [
309
- filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
310
- ]
311
- return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
303
+ bboxes_formatted = []
304
+ for frame_data in detections:
305
+ bboxes_formatted_per_frame = [
306
+ ODResponseData(
307
+ label=bbox["label"],
308
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
309
+ score=round(bbox["score"], 2),
310
+ )
311
+ for bbox in frame_data
312
+ ]
313
+ bboxes_formatted.append(bboxes_formatted_per_frame)
314
+ return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
312
315
 
313
316
 
314
317
  def grounding_sam(
@@ -708,23 +711,31 @@ def countgd_counting(
708
711
  image_size = image.shape[:2]
709
712
  if image_size[0] < 1 or image_size[1] < 1:
710
713
  return []
714
+
711
715
  buffer_bytes = numpy_to_bytes(image)
712
716
  files = [("image", buffer_bytes)]
713
- prompt = prompt.replace(", ", " .")
714
- payload = {"prompts": [prompt], "model": "countgd"}
717
+ payload = {
718
+ "prompts": [prompt.replace(", ", " .")],
719
+ "confidence": box_threshold, # still not being used in the API
720
+ "model": "countgd",
721
+ }
715
722
  metadata = {"function_name": "countgd_counting"}
716
- resp_data = send_task_inference_request(
723
+
724
+ detections = send_task_inference_request(
717
725
  payload, "text-to-object-detection", files=files, metadata=metadata
718
726
  )
719
- bboxes_per_frame = resp_data[0]
727
+
728
+ # get the first frame
729
+ bboxes = detections[0]
720
730
  bboxes_formatted = [
721
731
  ODResponseData(
722
732
  label=bbox["label"],
723
733
  bbox=normalize_bbox(bbox["bounding_box"], image_size),
724
734
  score=round(bbox["score"], 2),
725
735
  )
726
- for bbox in bboxes_per_frame
736
+ for bbox in bboxes
727
737
  ]
738
+ # TODO: remove this once we start to use the confidence on countgd
728
739
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
729
740
  return [bbox.model_dump() for bbox in filtered_bboxes]
730
741
 
@@ -768,6 +779,7 @@ def countgd_example_based_counting(
768
779
  image_size = image.shape[:2]
769
780
  if image_size[0] < 1 or image_size[1] < 1:
770
781
  return []
782
+
771
783
  buffer_bytes = numpy_to_bytes(image)
772
784
  files = [("image", buffer_bytes)]
773
785
  visual_prompts = [
@@ -775,10 +787,13 @@ def countgd_example_based_counting(
775
787
  ]
776
788
  payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
777
789
  metadata = {"function_name": "countgd_example_based_counting"}
778
- resp_data = send_task_inference_request(
790
+
791
+ detections = send_task_inference_request(
779
792
  payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
780
793
  )
781
- bboxes_per_frame = resp_data[0]
794
+
795
+ # get the first frame
796
+ bboxes_per_frame = detections[0]
782
797
  bboxes_formatted = [
783
798
  ODResponseData(
784
799
  label=bbox["label"],
@@ -1240,7 +1255,14 @@ def florence2_phrase_grounding(
1240
1255
  image_size = image.shape[:2]
1241
1256
  if image_size[0] < 1 or image_size[1] < 1:
1242
1257
  return []
1243
- image_b64 = convert_to_b64(image)
1258
+
1259
+ buffer_bytes = numpy_to_bytes(image)
1260
+ files = [("image", buffer_bytes)]
1261
+ payload = {
1262
+ "prompts": [s.strip() for s in prompt.split(",")],
1263
+ "model": "florence2",
1264
+ }
1265
+ metadata = {"function_name": "florence2_phrase_grounding"}
1244
1266
 
1245
1267
  if fine_tune_id is not None:
1246
1268
  landing_api = LandingPublicAPI()
@@ -1250,42 +1272,27 @@ def florence2_phrase_grounding(
1250
1272
  f"Fine-tuned model {fine_tune_id} is not ready yet"
1251
1273
  )
1252
1274
 
1253
- data_obj = Florence2FtRequest(
1254
- image=image_b64,
1255
- task=PromptTask.PHRASE_GROUNDING,
1256
- prompt=prompt,
1257
- job_id=UUID(fine_tune_id),
1258
- )
1259
- data = data_obj.model_dump(by_alias=True, exclude_none=True)
1260
- detections = send_inference_request(
1261
- data,
1262
- "florence2-ft",
1263
- v2=True,
1264
- is_form=True,
1265
- metadata_payload={"function_name": "florence2_phrase_grounding"},
1266
- )
1267
- # get the first frame
1268
- detection = detections[0]
1269
- else:
1270
- data = {
1271
- "image": image_b64,
1272
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
1273
- "prompt": prompt,
1274
- "function_name": "florence2_phrase_grounding",
1275
- }
1276
- detections = send_inference_request(data, "florence2", v2=True)
1277
- detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
1275
+ payload["jobId"] = fine_tune_id
1278
1276
 
1279
- return_data = []
1280
- for i in range(len(detection["bboxes"])):
1281
- return_data.append(
1282
- ODResponseData(
1283
- label=detection["labels"][i],
1284
- bbox=normalize_bbox(detection["bboxes"][i], image_size),
1285
- score=1.0,
1286
- )
1277
+ detections = send_task_inference_request(
1278
+ payload,
1279
+ "text-to-object-detection",
1280
+ files=files,
1281
+ metadata=metadata,
1282
+ )
1283
+
1284
+ # get the first frame
1285
+ bboxes = detections[0]
1286
+ bboxes_formatted = [
1287
+ ODResponseData(
1288
+ label=bbox["label"],
1289
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
1290
+ score=round(bbox["score"], 2),
1287
1291
  )
1288
- return [bbox.model_dump() for bbox in return_data]
1292
+ for bbox in bboxes
1293
+ ]
1294
+
1295
+ return [bbox.model_dump() for bbox in bboxes_formatted]
1289
1296
 
1290
1297
 
1291
1298
  def florence2_phrase_grounding_video(
@@ -1327,6 +1334,11 @@ def florence2_phrase_grounding_video(
1327
1334
  image_size = frames[0].shape[:2]
1328
1335
  buffer_bytes = frames_to_bytes(frames)
1329
1336
  files = [("video", buffer_bytes)]
1337
+ payload = {
1338
+ "prompts": [s.strip() for s in prompt.split(",")],
1339
+ "model": "florence2",
1340
+ }
1341
+ metadata = {"function_name": "florence2_phrase_grounding_video"}
1330
1342
 
1331
1343
  if fine_tune_id is not None:
1332
1344
  landing_api = LandingPublicAPI()
@@ -1336,41 +1348,25 @@ def florence2_phrase_grounding_video(
1336
1348
  f"Fine-tuned model {fine_tune_id} is not ready yet"
1337
1349
  )
1338
1350
 
1339
- data_obj = Florence2FtRequest(
1340
- task=PromptTask.PHRASE_GROUNDING,
1341
- prompt=prompt,
1342
- job_id=UUID(fine_tune_id),
1343
- )
1351
+ payload["jobId"] = fine_tune_id
1344
1352
 
1345
- data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
1346
- detections = send_inference_request(
1347
- data,
1348
- "florence2-ft",
1349
- v2=True,
1350
- files=files,
1351
- metadata_payload={"function_name": "florence2_phrase_grounding_video"},
1352
- )
1353
- else:
1354
- data = {
1355
- "prompt": prompt,
1356
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
1357
- "function_name": "florence2_phrase_grounding_video",
1358
- "video": base64.b64encode(buffer_bytes).decode("utf-8"),
1359
- }
1360
- detections = send_inference_request(data, "florence2", v2=True)
1361
- detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
1353
+ detections = send_task_inference_request(
1354
+ payload,
1355
+ "text-to-object-detection",
1356
+ files=files,
1357
+ metadata=metadata,
1358
+ )
1362
1359
 
1363
1360
  bboxes_formatted = []
1364
1361
  for frame_data in detections:
1365
- bboxes_formatted_per_frame = []
1366
- for idx in range(len(frame_data["bboxes"])):
1367
- bboxes_formatted_per_frame.append(
1368
- ODResponseData(
1369
- label=frame_data["labels"][idx],
1370
- bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
1371
- score=1.0,
1372
- )
1362
+ bboxes_formatted_per_frame = [
1363
+ ODResponseData(
1364
+ label=bbox["label"],
1365
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
1366
+ score=round(bbox["score"], 2),
1373
1367
  )
1368
+ for bbox in frame_data
1369
+ ]
1374
1370
  bboxes_formatted.append(bboxes_formatted_per_frame)
1375
1371
  return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
1376
1372
 
File without changes
File without changes