vision-agent 0.2.151__py3-none-any.whl → 0.2.153__py3-none-any.whl

@@ -101,7 +101,7 @@ plan1:
  - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
  plan2:
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
- - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+ - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
  plan3:
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
  - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:

  ```python
  import numpy as np
- from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking

  # sample at 1 FPS and use the first 10 frames to reduce processing time
  frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
  owl_v2_counts = get_counts(owl_v2_out)

  # plan2
- florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+ florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
  florence2_counts = get_counts(florence2_out)

  # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)

  final_out = {{
  "owl_v2_video": owl_v2_out,
- "florence2_phrase_grounding": florence2_out,
+ "florence2_phrase_grounding_image": florence2_out,
  "florence2_sam2_video_tracking": f2s2_out,
  }}

  counts = {{
  "owl_v2_video": owl_v2_counts,
- "florence2_phrase_grounding": florence2_counts,
+ "florence2_phrase_grounding_image": florence2_counts,
  "florence2_sam2_video_tracking": f2s2_counts,
  }}

@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi

  OBSERVATION:
  [Artifact code.py]
- 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ 0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
  1|def count_workers_with_helmets(image_path: str, output_path: str):
  2| image = load_image(image_path)
- 3| detections = florence2_phrase_grounding("worker, helmet", image)
+ 3| detections = florence2_phrase_grounding_image("worker, helmet", image)
  4| workers = [d for d in detections if d['label'] == 'worker']
  5| helmets = [d for d in detections if d['label'] == 'helmet']
  6| count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
  OBSERVATION:
  [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

  OBSERVATION:
  [Artifact code.py edits]
  ---
  +++
  @@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
  def count_workers_with_helmets(image_path: str, output_path: str):
  image = load_image(image_path)
- - detections = florence2_phrase_grounding("worker, helmet", image)
- + detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+ - detections = florence2_phrase_grounding_image("worker, helmet", image)
+ + detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
  workers = [d for d in detections if d['label'] == 'worker']
  helmets = [d for d in detections if d['label'] == 'helmet']
  count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
  ----- stdout -----
  3

- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+ AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
  """
@@ -24,7 +24,7 @@ from .tools import (
  extract_frames_and_timestamps,
  florence2_image_caption,
  florence2_ocr,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
  florence2_roberta_vqa,
  florence2_sam2_image,
  florence2_sam2_video_tracking,
@@ -668,8 +668,12 @@ def use_object_detection_fine_tuning(

  patterns_with_fine_tune_id = [
  (
- r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
- lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+ r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+ lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+ ),
+ (
+ r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+ lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
  ),
  (
  r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
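For illustration (not part of the diff): a minimal sketch of how the renamed `florence2_phrase_grounding_image` pattern above rewrites an existing call when a fine-tune id is supplied. The sample code string and the id value below are hypothetical.

```python
import re

# Hypothetical call to be rewritten; the id is a made-up example value.
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
code = 'detections = florence2_phrase_grounding_image("worker, helmet", image)'

# Same pattern/replacement pair as added in the hunk above: capture the prompt
# and the image argument, then re-emit the call with the fine-tune id appended.
pattern = r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
rewritten = re.sub(
    pattern,
    lambda m: f'florence2_phrase_grounding_image("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")',
    code,
)
print(rewritten)
# detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```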
@@ -1,6 +1,6 @@
+ import os
  import inspect
  import logging
- import os
  from base64 import b64encode
  from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

@@ -37,8 +37,9 @@ def send_inference_request(
  files: Optional[List[Tuple[Any, ...]]] = None,
  v2: bool = False,
  metadata_payload: Optional[Dict[str, Any]] = None,
+ is_form: bool = False,
  ) -> Any:
- # TODO: runtime_tag and function_name should be metadata_payload and now included
+ # TODO: runtime_tag and function_name should be metadata_payload and not included
  # in the service payload
  if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
  payload["runtime_tag"] = runtime_tag
@@ -64,7 +65,7 @@ def send_inference_request(
  elif metadata_payload is not None and "function_name" in metadata_payload:
  function_name = metadata_payload["function_name"]

- response = _call_post(url, payload, session, files, function_name)
+ response = _call_post(url, payload, session, files, function_name, is_form)

  # TODO: consider making the response schema the same between below two sources
  return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
  task_name: str,
  files: Optional[List[Tuple[Any, ...]]] = None,
  metadata: Optional[Dict[str, Any]] = None,
+ is_form: bool = False,
  ) -> Any:
  url = f"{_LND_API_URL_v2}/{task_name}"
  headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
  function_name = "unknown"
  if metadata is not None and "function_name" in metadata:
  function_name = metadata["function_name"]
- response = _call_post(url, payload, session, files, function_name)
+ response = _call_post(url, payload, session, files, function_name, is_form)
  return response["data"]


@@ -203,6 +205,7 @@ def _call_post(
  session: Session,
  files: Optional[List[Tuple[Any, ...]]] = None,
  function_name: str = "unknown",
+ is_form: bool = False,
  ) -> Any:
  files_in_b64 = None
  if files:
@@ -210,6 +213,8 @@ def _call_post(
  try:
  if files is not None:
  response = session.post(url, data=payload, files=files)
+ elif is_form:
+ response = session.post(url, data=payload)
  else:
  response = session.post(url, json=payload)

@@ -1,3 +1,4 @@
+ import base64
  import io
  import json
  import logging
@@ -28,7 +29,6 @@ from vision_agent.tools.tool_utils import (
  send_task_inference_request,
  )
  from vision_agent.tools.tools_types import (
- FineTuning,
  Florence2FtRequest,
  JobStatus,
  ODResponseData,
@@ -194,20 +194,26 @@ def owl_v2_image(
  data_obj = Florence2FtRequest(
  image=image_b64,
  task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
  prompt=prompt,
- fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ job_id=UUID(fine_tune_id),
  )
- data = data_obj.model_dump(by_alias=True)
- detections = send_inference_request(data, "tools", v2=False)
- detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+ data = data_obj.model_dump(by_alias=True, exclude_none=True)
+ detections = send_inference_request(
+ data,
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "owl_v2_image"},
+ )
+ # get the first frame
+ detection = detections[0]
  bboxes_formatted = [
  ODResponseData(
- label=detections["labels"][i],
- bbox=normalize_bbox(detections["bboxes"][i], image_size),
+ label=detection["labels"][i],
+ bbox=normalize_bbox(detection["bboxes"][i], image_size),
  score=1.0,
  )
- for i in range(len(detections["bboxes"]))
+ for i in range(len(detection["bboxes"]))
  ]
  return [bbox.model_dump() for bbox in bboxes_formatted]

@@ -419,25 +425,30 @@ def florence2_sam2_image(
  req_data_obj = Florence2FtRequest(
  image=image_b64,
  task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
  prompt=prompt,
- fine_tuning=FineTuning(
- job_id=UUID(fine_tune_id),
- postprocessing="sam2",
- ),
+ postprocessing="sam2",
+ job_id=UUID(fine_tune_id),
+ )
+ req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+ detections_ft = send_inference_request(
+ req_data,
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "florence2_sam2_image"},
  )
- req_data = req_data_obj.model_dump(by_alias=True)
- detections_ft = send_inference_request(req_data, "tools", v2=False)
- detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+ # get the first frame
+ detection = detections_ft[0]
  return_data = []
- all_masks = np.array(detections_ft["masks"])
- for i in range(len(detections_ft["bboxes"])):
+ for i in range(len(detection["bboxes"])):
  return_data.append(
  {
  "score": 1.0,
- "label": detections_ft["labels"][i],
- "bbox": detections_ft["bboxes"][i],
- "mask": all_masks[i, :, :].astype(np.uint8),
+ "label": detection["labels"][i],
+ "bbox": normalize_bbox(
+ detection["bboxes"][i], detection["masks"][i]["size"]
+ ),
+ "mask": rle_decode_array(detection["masks"][i]),
  }
  )
  return return_data
@@ -451,6 +462,7 @@ def florence2_sam2_image(
  detections: Dict[str, Any] = send_inference_request(
  payload, "florence2-sam2", files=files, v2=True
  )
+
  return_data = []
  for _, data_i in detections["0"].items():
  mask = rle_decode_array(data_i["mask"])
@@ -688,22 +700,18 @@ def countgd_counting(
  {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
  ]
  """
- buffer_bytes = numpy_to_bytes(image)
- files = [("image", buffer_bytes)]
+ image_b64 = convert_to_b64(image)
  prompt = prompt.replace(", ", " .")
- payload = {"prompts": [prompt], "model": "countgd"}
+ payload = {"prompt": prompt, "image": image_b64}
  metadata = {"function_name": "countgd_counting"}
- resp_data = send_task_inference_request(
- payload, "text-to-object-detection", files=files, metadata=metadata
- )
- bboxes_per_frame = resp_data[0]
+ resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
  bboxes_formatted = [
  ODResponseData(
  label=bbox["label"],
- bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+ bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
  score=round(bbox["score"], 2),
  )
- for bbox in bboxes_per_frame
+ for bbox in resp_data
  ]
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
  return [bbox.model_dump() for bbox in filtered_bboxes]
@@ -887,7 +895,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
  "function_name": "ixc25_temporal_localization",
  }
  data: List[int] = send_inference_request(
- payload, "video-temporal-localization", files=files, v2=True
+ payload,
+ "video-temporal-localization?model=internlm-xcomposer",
+ files=files,
+ v2=True,
  )
  chunk_size = round(len(frames) / len(data))
  data_explode = [[elt] * chunk_size for elt in data]
@@ -1132,13 +1143,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
  return answer[task] # type: ignore


- def florence2_phrase_grounding(
+ def florence2_phrase_grounding_image(
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
  ) -> List[Dict[str, Any]]:
- """'florence2_phrase_grounding' is a tool that can detect multiple
- objects given a text prompt which can be object names or caption. You
- can optionally separate the object names in the text with commas. It returns a list
- of bounding boxes with normalized coordinates, label names and associated
+ """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+ detect multiple objects given a text prompt which can be object names or caption.
+ You can optionally separate the object names in the text with commas. It returns
+ a list of bounding boxes with normalized coordinates, label names and associated
  probability scores of 1.0.

  Parameters:
@@ -1156,7 +1167,7 @@ def florence2_phrase_grounding(

  Example
  -------
- >>> florence2_phrase_grounding('person looking at a coyote', image)
+ >>> florence2_phrase_grounding_image('person looking at a coyote', image)
  [
  {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1176,39 +1187,128 @@ def florence2_phrase_grounding(
  data_obj = Florence2FtRequest(
  image=image_b64,
  task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
  prompt=prompt,
- fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ job_id=UUID(fine_tune_id),
  )
- data = data_obj.model_dump(by_alias=True)
+ data = data_obj.model_dump(by_alias=True, exclude_none=True)
  detections = send_inference_request(
  data,
- "tools",
- v2=False,
- metadata_payload={"function_name": "florence2_phrase_grounding"},
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "florence2_phrase_grounding_image"},
  )
+ # get the first frame
+ detection = detections[0]
  else:
  data = {
  "image": image_b64,
  "task": "<CAPTION_TO_PHRASE_GROUNDING>",
  "prompt": prompt,
- "function_name": "florence2_phrase_grounding",
+ "function_name": "florence2_phrase_grounding_image",
  }
  detections = send_inference_request(data, "florence2", v2=True)
+ detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]

- detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
  return_data = []
- for i in range(len(detections["bboxes"])):
+ for i in range(len(detection["bboxes"])):
  return_data.append(
  ODResponseData(
- label=detections["labels"][i],
- bbox=normalize_bbox(detections["bboxes"][i], image_size),
+ label=detection["labels"][i],
+ bbox=normalize_bbox(detection["bboxes"][i], image_size),
  score=1.0,
  )
  )
  return [bbox.model_dump() for bbox in return_data]


+ def florence2_phrase_grounding_video(
+ prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+ ) -> List[List[Dict[str, Any]]]:
+ """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+ It can detect multiple objects given a text prompt which can be object names or
+ caption. You can optionally separate the object names in the text with commas.
+ It returns a list of lists where each inner list contains bounding boxes with
+ normalized coordinates, label names and associated probability scores of 1.0.
+
+ Parameters:
+ prompt (str): The prompt to ground to the video.
+ frames (List[np.ndarray]): The list of frames to detect objects.
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.
+
+ Returns:
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+ label, and bounding box of the detected objects with normalized coordinates
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
+ the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+ Example
+ -------
+ >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+ [
+ [
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+ ],
+ ...
+ ]
+ """
+ if len(frames) == 0:
+ raise ValueError("No frames provided")
+
+ image_size = frames[0].shape[:2]
+ buffer_bytes = frames_to_bytes(frames)
+ files = [("video", buffer_bytes)]
+
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ task=PromptTask.PHRASE_GROUNDING,
+ prompt=prompt,
+ job_id=UUID(fine_tune_id),
+ )
+
+ data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+ detections = send_inference_request(
+ data,
+ "florence2-ft",
+ v2=True,
+ files=files,
+ metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+ )
+ else:
+ data = {
+ "prompt": prompt,
+ "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+ "function_name": "florence2_phrase_grounding_video",
+ "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+ detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+
+ bboxes_formatted = []
+ for frame_data in detections:
+ bboxes_formatted_per_frame = []
+ for idx in range(len(frame_data["bboxes"])):
+ bboxes_formatted_per_frame.append(
+ ODResponseData(
+ label=frame_data["labels"][idx],
+ bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+ score=1.0,
+ )
+ )
+ bboxes_formatted.append(bboxes_formatted_per_frame)
+ return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
  def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  """'florence2_ocr' is a tool that can detect text and text regions in an image.
  Each text region contains one line of text. It returns a list of detected text,
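As a hedged usage sketch (not part of the diff): the renamed image tool and the new video tool defined above can be exercised as follows, assuming vision-agent 0.2.153 is installed, a Landing AI API key is configured, and a local 'video.mp4' exists. Both functions are imported directly from the tools module shown in this diff.

```python
from vision_agent.tools.tools import (
    extract_frames_and_timestamps,
    florence2_phrase_grounding_image,
    florence2_phrase_grounding_video,
)

# Sample the video at 1 FPS, as in the planning prompt earlier in this diff.
frames_and_ts = extract_frames_and_timestamps("video.mp4", 1)
frames = [f["frame"] for f in frames_and_ts]

# Per-frame grounding with the renamed image tool ...
per_frame = [florence2_phrase_grounding_image("person", f) for f in frames]

# ... or one call over the whole clip with the new video tool, which returns
# one list of detections per frame.
per_clip = florence2_phrase_grounding_video("person", frames)
```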
@@ -1220,7 +1320,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
- with nornmalized coordinates, and confidence score.
+ with normalized coordinates, and confidence score.

  Example
  -------
@@ -1603,7 +1703,7 @@ def extract_frames_and_timestamps(
  """

  def reformat(
- frames_and_timestamps: List[Tuple[np.ndarray, float]]
+ frames_and_timestamps: List[Tuple[np.ndarray, float]],
  ) -> List[Dict[str, Union[np.ndarray, float]]]:
  return [
  {"frame": frame, "timestamp": timestamp}
@@ -2017,7 +2117,7 @@ def overlay_counting_results(
  fontsize,
  )

- for i, elt in enumerate(instances):
+ for i, elt in enumerate(instances, 1):
  label = f"{i}"
  box = elt["bbox"]

@@ -2064,7 +2164,8 @@ FUNCTION_TOOLS = [
  florence2_ocr,
  florence2_sam2_image,
  florence2_sam2_video_tracking,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
+ florence2_phrase_grounding_video,
  ixc25_image_vqa,
  ixc25_video_vqa,
  detr_segmentation,
@@ -1,6 +1,6 @@
  from enum import Enum
- from typing import List, Optional, Tuple, Union
  from uuid import UUID
+ from typing import List, Optional, Tuple, Union

  from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer

@@ -24,27 +24,22 @@ class PromptTask(str, Enum):
  PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


- class FineTuning(BaseModel):
+ class Florence2FtRequest(BaseModel):
  model_config = ConfigDict(populate_by_name=True)

- job_id: UUID = Field(alias="jobId")
+ image: Optional[str] = None
+ video: Optional[bytes] = None
+ task: PromptTask
+ prompt: Optional[str] = ""
+ chunk_length_frames: Optional[int] = None
  postprocessing: Optional[str] = None
+ job_id: Optional[UUID] = Field(None, alias="jobId")

  @field_serializer("job_id")
  def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
  return str(job_id)


- class Florence2FtRequest(BaseModel):
- model_config = ConfigDict(populate_by_name=True)
-
- image: str
- task: PromptTask
- tool: str
- prompt: Optional[str] = ""
- fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
-
-
  class JobStatus(str, Enum):
  """The status of a fine-tuning job.

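A hedged sketch (not part of the diff) of populating the flattened `Florence2FtRequest` above, now that the nested `FineTuning` model is gone and `job_id` sits directly on the request; the UUID below is a placeholder, not a real fine-tuning job.

```python
from uuid import UUID

from vision_agent.tools.tools_types import Florence2FtRequest, PromptTask

# job_id is serialized under its "jobId" alias; the value here is made up.
req = Florence2FtRequest(
    task=PromptTask.PHRASE_GROUNDING,
    prompt="person",
    job_id=UUID("23b3b022-5ebf-4798-9373-20ef36429abf"),
)

# exclude_none drops the unused optional fields (image, video, etc.),
# matching how the callers in tools.py build their payloads.
payload = req.model_dump(by_alias=True, exclude_none=True)
```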
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.151
+ Version: 0.2.153
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
  vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
  vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
  vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
- vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=Ea_v_qLBJMVwQVLLIdNq15MgV2-6qqhcThHAHFwzv-o,18940
+ vision_agent/agent/vision_agent_prompts.py,sha256=eOqluRb1R_SJFsdWXd9HJuiJnJccEnDDUkfPXlHOjyw,11293
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,12 +14,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
- vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
- vision_agent/tools/meta_tools.py,sha256=yBlkRTeEfI3sAMZbz5mvOsHu9e1OrzDw6XLd6t-U0IY,24909
+ vision_agent/tools/__init__.py,sha256=cg4Axb9L3Z7WkdyEv5IyqDsmZKIrxmS4CmV3DEXURnU,2418
+ vision_agent/tools/meta_tools.py,sha256=yrplxiDu-L9_Dw_L2ESehJabckAq59Q-xfMpIbYB0Ak,25179
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
- vision_agent/tools/tools.py,sha256=c7SjtZD7YfxhEAGYYe-ExVCBA4NDXmRwerBIbd-XEH8,74557
- vision_agent/tools/tools_types.py,sha256=JUOZWGW2q-dlJ85CHr9gvo9KQk_rXyjJhi-iwPNn4eM,2397
+ vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
+ vision_agent/tools/tools.py,sha256=Of7NTZTc1bim_fdAoDxx47WzttGI8VlMKKcId0sMwfk,78406
+ vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
  vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4,28017
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
- vision_agent-0.2.151.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.151.dist-info/METADATA,sha256=61jba11RSszH3vWXJi2_CoqbwaXEqSTCcJWakNNFBTU,13758
- vision_agent-0.2.151.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.151.dist-info/RECORD,,
+ vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.153.dist-info/METADATA,sha256=zehWh4l1EfZeTKxSEgKXtQMb0EE5pvWP1UG0d2lyS44,13758
+ vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.153.dist-info/RECORD,,