vision-agent 0.2.151__py3-none-any.whl → 0.2.153__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
@@ -101,7 +101,7 @@ plan1:
  - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
  plan2:
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
- - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+ - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
  plan3:
  - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
  - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:

  ```python
  import numpy as np
- from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking

  # sample at 1 FPS and use the first 10 frames to reduce processing time
  frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
  owl_v2_counts = get_counts(owl_v2_out)

  # plan2
- florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+ florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
  florence2_counts = get_counts(florence2_out)

  # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)

  final_out = {{
  "owl_v2_video": owl_v2_out,
- "florence2_phrase_grounding": florence2_out,
+ "florence2_phrase_grounding_image": florence2_out,
  "florence2_sam2_video_tracking": f2s2_out,
  }}

  counts = {{
  "owl_v2_video": owl_v2_counts,
- "florence2_phrase_grounding": florence2_counts,
+ "florence2_phrase_grounding_image": florence2_counts,
  "florence2_sam2_video_tracking": f2s2_counts,
  }}

@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi

  OBSERVATION:
  [Artifact code.py]
- 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ 0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
  1|def count_workers_with_helmets(image_path: str, output_path: str):
  2| image = load_image(image_path)
- 3| detections = florence2_phrase_grounding("worker, helmet", image)
+ 3| detections = florence2_phrase_grounding_image("worker, helmet", image)
  4| workers = [d for d in detections if d['label'] == 'worker']
  5| helmets = [d for d in detections if d['label'] == 'helmet']
  6| count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
  OBSERVATION:
  [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

  OBSERVATION:
  [Artifact code.py edits]
  ---
  +++
  @@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
  def count_workers_with_helmets(image_path: str, output_path: str):
  image = load_image(image_path)
- - detections = florence2_phrase_grounding("worker, helmet", image)
- + detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+ - detections = florence2_phrase_grounding_image("worker, helmet", image)
+ + detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
  workers = [d for d in detections if d['label'] == 'worker']
  helmets = [d for d in detections if d['label'] == 'helmet']
  count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
  ----- stdout -----
  3

- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+ AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
  """
@@ -24,7 +24,7 @@ from .tools import (
  extract_frames_and_timestamps,
  florence2_image_caption,
  florence2_ocr,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
  florence2_roberta_vqa,
  florence2_sam2_image,
  florence2_sam2_video_tracking,
@@ -668,8 +668,12 @@ def use_object_detection_fine_tuning(

  patterns_with_fine_tune_id = [
  (
- r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
- lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+ r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+ lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+ ),
+ (
+ r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+ lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
  ),
  (
  r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
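The added pattern pair lets `use_object_detection_fine_tuning` rewrite the new video grounding calls as well as the image ones. A hedged sketch of what one (pattern, replacement) pair does when applied with `re.sub`, reusing the example job id from the prompts above:

```python
import re

fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"  # example id from the prompts above

pattern, repl = (
    r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
    lambda m: f'florence2_phrase_grounding_video("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")',
)

code = 'detections = florence2_phrase_grounding_video("worker, helmet", frames)'
print(re.sub(pattern, repl, code))
# detections = florence2_phrase_grounding_video("worker, helmet", frames, "23b3b022-5ebf-4798-9373-20ef36429abf")
```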
@@ -1,6 +1,6 @@
+ import os
  import inspect
  import logging
- import os
  from base64 import b64encode
  from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

@@ -37,8 +37,9 @@ def send_inference_request(
  files: Optional[List[Tuple[Any, ...]]] = None,
  v2: bool = False,
  metadata_payload: Optional[Dict[str, Any]] = None,
+ is_form: bool = False,
  ) -> Any:
- # TODO: runtime_tag and function_name should be metadata_payload and now included
+ # TODO: runtime_tag and function_name should be metadata_payload and not included
  # in the service payload
  if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
  payload["runtime_tag"] = runtime_tag
@@ -64,7 +65,7 @@ def send_inference_request(
  elif metadata_payload is not None and "function_name" in metadata_payload:
  function_name = metadata_payload["function_name"]

- response = _call_post(url, payload, session, files, function_name)
+ response = _call_post(url, payload, session, files, function_name, is_form)

  # TODO: consider making the response schema the same between below two sources
  return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
  task_name: str,
  files: Optional[List[Tuple[Any, ...]]] = None,
  metadata: Optional[Dict[str, Any]] = None,
+ is_form: bool = False,
  ) -> Any:
  url = f"{_LND_API_URL_v2}/{task_name}"
  headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
  function_name = "unknown"
  if metadata is not None and "function_name" in metadata:
  function_name = metadata["function_name"]
- response = _call_post(url, payload, session, files, function_name)
+ response = _call_post(url, payload, session, files, function_name, is_form)
  return response["data"]


@@ -203,6 +205,7 @@ def _call_post(
  session: Session,
  files: Optional[List[Tuple[Any, ...]]] = None,
  function_name: str = "unknown",
+ is_form: bool = False,
  ) -> Any:
  files_in_b64 = None
  if files:
@@ -210,6 +213,8 @@ def _call_post(
  try:
  if files is not None:
  response = session.post(url, data=payload, files=files)
+ elif is_form:
+ response = session.post(url, data=payload)
  else:
  response = session.post(url, json=payload)

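The new `is_form` flag threads from `send_inference_request` and `send_task_inference_request` down to `_call_post`, where it selects a form-encoded body instead of JSON. A minimal sketch of that branch, assuming a plain `requests.Session`:

```python
import requests


def post_payload(session: requests.Session, url: str, payload: dict,
                 files=None, is_form: bool = False):
    if files is not None:
        # multipart upload: payload travels as form fields next to the files
        return session.post(url, data=payload, files=files)
    if is_form:
        # application/x-www-form-urlencoded body (used by the florence2-ft task)
        return session.post(url, data=payload)
    # default: JSON body
    return session.post(url, json=payload)
```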
@@ -1,3 +1,4 @@
+ import base64
  import io
  import json
  import logging
@@ -28,7 +29,6 @@ from vision_agent.tools.tool_utils import (
  send_task_inference_request,
  )
  from vision_agent.tools.tools_types import (
- FineTuning,
  Florence2FtRequest,
  JobStatus,
  ODResponseData,
@@ -194,20 +194,26 @@ def owl_v2_image(
  data_obj = Florence2FtRequest(
  image=image_b64,
  task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
  prompt=prompt,
- fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ job_id=UUID(fine_tune_id),
  )
- data = data_obj.model_dump(by_alias=True)
- detections = send_inference_request(data, "tools", v2=False)
- detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+ data = data_obj.model_dump(by_alias=True, exclude_none=True)
+ detections = send_inference_request(
+ data,
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "owl_v2_image"},
+ )
+ # get the first frame
+ detection = detections[0]
  bboxes_formatted = [
  ODResponseData(
- label=detections["labels"][i],
- bbox=normalize_bbox(detections["bboxes"][i], image_size),
+ label=detection["labels"][i],
+ bbox=normalize_bbox(detection["bboxes"][i], image_size),
  score=1.0,
  )
- for i in range(len(detections["bboxes"]))
+ for i in range(len(detection["bboxes"]))
  ]
  return [bbox.model_dump() for bbox in bboxes_formatted]

@@ -419,25 +425,30 @@ def florence2_sam2_image(
  req_data_obj = Florence2FtRequest(
  image=image_b64,
  task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
  prompt=prompt,
- fine_tuning=FineTuning(
- job_id=UUID(fine_tune_id),
- postprocessing="sam2",
- ),
+ postprocessing="sam2",
+ job_id=UUID(fine_tune_id),
+ )
+ req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+ detections_ft = send_inference_request(
+ req_data,
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "florence2_sam2_image"},
  )
- req_data = req_data_obj.model_dump(by_alias=True)
- detections_ft = send_inference_request(req_data, "tools", v2=False)
- detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+ # get the first frame
+ detection = detections_ft[0]
  return_data = []
- all_masks = np.array(detections_ft["masks"])
- for i in range(len(detections_ft["bboxes"])):
+ for i in range(len(detection["bboxes"])):
  return_data.append(
  {
  "score": 1.0,
- "label": detections_ft["labels"][i],
- "bbox": detections_ft["bboxes"][i],
- "mask": all_masks[i, :, :].astype(np.uint8),
+ "label": detection["labels"][i],
+ "bbox": normalize_bbox(
+ detection["bboxes"][i], detection["masks"][i]["size"]
+ ),
+ "mask": rle_decode_array(detection["masks"][i]),
  }
  )
  return return_data
@@ -451,6 +462,7 @@ def florence2_sam2_image(
  detections: Dict[str, Any] = send_inference_request(
  payload, "florence2-sam2", files=files, v2=True
  )
+
  return_data = []
  for _, data_i in detections["0"].items():
  mask = rle_decode_array(data_i["mask"])
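In the fine-tuned branch above, masks now arrive run-length encoded and are decoded with `rle_decode_array`, with boxes normalized against the mask's own size. A small consumer-side sketch, assuming the documented return shape (dicts with `score`, `label`, `bbox`, and a binary `mask` array); the image path is hypothetical:

```python
import numpy as np
from vision_agent.tools import load_image, florence2_sam2_image

image = load_image("workers.png")  # hypothetical input image
detections = florence2_sam2_image("worker, helmet", image)

for det in detections:
    mask = det["mask"]                     # binary mask array (RLE is decoded by the tool)
    area_px = int(np.count_nonzero(mask))  # pixel area of the segmented object
    print(det["label"], det["bbox"], area_px)
```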
@@ -688,22 +700,18 @@ def countgd_counting(
  {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
  ]
  """
- buffer_bytes = numpy_to_bytes(image)
- files = [("image", buffer_bytes)]
+ image_b64 = convert_to_b64(image)
  prompt = prompt.replace(", ", " .")
- payload = {"prompts": [prompt], "model": "countgd"}
+ payload = {"prompt": prompt, "image": image_b64}
  metadata = {"function_name": "countgd_counting"}
- resp_data = send_task_inference_request(
- payload, "text-to-object-detection", files=files, metadata=metadata
- )
- bboxes_per_frame = resp_data[0]
+ resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
  bboxes_formatted = [
  ODResponseData(
  label=bbox["label"],
- bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+ bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
  score=round(bbox["score"], 2),
  )
- for bbox in bboxes_per_frame
+ for bbox in resp_data
  ]
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
  return [bbox.model_dump() for bbox in filtered_bboxes]
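`countgd_counting` now sends a base64-encoded image and a single `prompt` to a dedicated `countgd` task, and reads `bbox` instead of `bounding_box` from the response; the caller-facing behavior is unchanged. A minimal usage sketch (the image path is hypothetical):

```python
from vision_agent.tools import load_image, countgd_counting

image = load_image("flowers.jpg")  # hypothetical input image
detections = countgd_counting("flower", image)

print(len(detections), "flowers found")
print(detections[0])  # e.g. {'score': 0.98, 'label': 'flower', 'bbox': [...]}
```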
@@ -887,7 +895,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
  "function_name": "ixc25_temporal_localization",
  }
  data: List[int] = send_inference_request(
- payload, "video-temporal-localization", files=files, v2=True
+ payload,
+ "video-temporal-localization?model=internlm-xcomposer",
+ files=files,
+ v2=True,
  )
  chunk_size = round(len(frames) / len(data))
  data_explode = [[elt] * chunk_size for elt in data]
@@ -1132,13 +1143,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
  return answer[task] # type: ignore


- def florence2_phrase_grounding(
+ def florence2_phrase_grounding_image(
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
  ) -> List[Dict[str, Any]]:
- """'florence2_phrase_grounding' is a tool that can detect multiple
- objects given a text prompt which can be object names or caption. You
- can optionally separate the object names in the text with commas. It returns a list
- of bounding boxes with normalized coordinates, label names and associated
+ """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+ detect multiple objects given a text prompt which can be object names or caption.
+ You can optionally separate the object names in the text with commas. It returns
+ a list of bounding boxes with normalized coordinates, label names and associated
  probability scores of 1.0.

  Parameters:
@@ -1156,7 +1167,7 @@ def florence2_phrase_grounding(

  Example
  -------
- >>> florence2_phrase_grounding('person looking at a coyote', image)
+ >>> florence2_phrase_grounding_image('person looking at a coyote', image)
  [
  {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
  {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1176,39 +1187,128 @@ def florence2_phrase_grounding(
  data_obj = Florence2FtRequest(
  image=image_b64,
  task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
  prompt=prompt,
- fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ job_id=UUID(fine_tune_id),
  )
- data = data_obj.model_dump(by_alias=True)
+ data = data_obj.model_dump(by_alias=True, exclude_none=True)
  detections = send_inference_request(
  data,
- "tools",
- v2=False,
- metadata_payload={"function_name": "florence2_phrase_grounding"},
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "florence2_phrase_grounding_image"},
  )
+ # get the first frame
+ detection = detections[0]
  else:
  data = {
  "image": image_b64,
  "task": "<CAPTION_TO_PHRASE_GROUNDING>",
  "prompt": prompt,
- "function_name": "florence2_phrase_grounding",
+ "function_name": "florence2_phrase_grounding_image",
  }
  detections = send_inference_request(data, "florence2", v2=True)
+ detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]

- detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
  return_data = []
- for i in range(len(detections["bboxes"])):
+ for i in range(len(detection["bboxes"])):
  return_data.append(
  ODResponseData(
- label=detections["labels"][i],
- bbox=normalize_bbox(detections["bboxes"][i], image_size),
+ label=detection["labels"][i],
+ bbox=normalize_bbox(detection["bboxes"][i], image_size),
  score=1.0,
  )
  )
  return [bbox.model_dump() for bbox in return_data]


+ def florence2_phrase_grounding_video(
+ prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+ ) -> List[List[Dict[str, Any]]]:
+ """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+ It can detect multiple objects given a text prompt which can be object names or
+ caption. You can optionally separate the object names in the text with commas.
+ It returns a list of lists where each inner list contains bounding boxes with
+ normalized coordinates, label names and associated probability scores of 1.0.
+
+ Parameters:
+ prompt (str): The prompt to ground to the video.
+ frames (List[np.ndarray]): The list of frames to detect objects.
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.
+
+ Returns:
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+ label, and bounding box of the detected objects with normalized coordinates
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
+ the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+ Example
+ -------
+ >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+ [
+ [
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+ ],
+ ...
+ ]
+ """
+ if len(frames) == 0:
+ raise ValueError("No frames provided")
+
+ image_size = frames[0].shape[:2]
+ buffer_bytes = frames_to_bytes(frames)
+ files = [("video", buffer_bytes)]
+
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ task=PromptTask.PHRASE_GROUNDING,
+ prompt=prompt,
+ job_id=UUID(fine_tune_id),
+ )
+
+ data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+ detections = send_inference_request(
+ data,
+ "florence2-ft",
+ v2=True,
+ files=files,
+ metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+ )
+ else:
+ data = {
+ "prompt": prompt,
+ "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+ "function_name": "florence2_phrase_grounding_video",
+ "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+ detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+
+ bboxes_formatted = []
+ for frame_data in detections:
+ bboxes_formatted_per_frame = []
+ for idx in range(len(frame_data["bboxes"])):
+ bboxes_formatted_per_frame.append(
+ ODResponseData(
+ label=frame_data["labels"][idx],
+ bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+ score=1.0,
+ )
+ )
+ bboxes_formatted.append(bboxes_formatted_per_frame)
+ return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
  def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  """'florence2_ocr' is a tool that can detect text and text regions in an image.
  Each text region contains one line of text. It returns a list of detected text,
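The new `florence2_phrase_grounding_video` added above returns one list of detections per input frame. A hedged sketch of calling it and consuming that nested structure; the video path is hypothetical:

```python
from vision_agent.tools import (
    extract_frames_and_timestamps,
    florence2_phrase_grounding_video,
)

frames_and_ts = extract_frames_and_timestamps("video.mp4", 1)  # hypothetical video path
frames = [f["frame"] for f in frames_and_ts]

# one call grounds the prompt on every frame; each inner list holds dicts with
# label, normalized bbox and a fixed score of 1.0
per_frame = florence2_phrase_grounding_video("person", frames)
for idx, detections in enumerate(per_frame):
    people = [d for d in detections if d["label"] == "person"]
    print(f"frame {idx}: {len(people)} people")
```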
@@ -1220,7 +1320,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
- with nornmalized coordinates, and confidence score.
+ with normalized coordinates, and confidence score.

  Example
  -------
@@ -1603,7 +1703,7 @@ def extract_frames_and_timestamps(
  """

  def reformat(
- frames_and_timestamps: List[Tuple[np.ndarray, float]]
+ frames_and_timestamps: List[Tuple[np.ndarray, float]],
  ) -> List[Dict[str, Union[np.ndarray, float]]]:
  return [
  {"frame": frame, "timestamp": timestamp}
@@ -2017,7 +2117,7 @@ def overlay_counting_results(
  fontsize,
  )

- for i, elt in enumerate(instances):
+ for i, elt in enumerate(instances, 1):
  label = f"{i}"
  box = elt["bbox"]

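The `enumerate(instances, 1)` change in `overlay_counting_results` only makes the overlaid count labels start at 1 instead of 0. A tiny illustration of the difference:

```python
instances = [{"bbox": [0.1, 0.1, 0.2, 0.2]}, {"bbox": [0.3, 0.3, 0.4, 0.4]}]

print([f"{i}" for i, _ in enumerate(instances)])     # ['0', '1']  (old labels)
print([f"{i}" for i, _ in enumerate(instances, 1)])  # ['1', '2']  (new labels)
```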
@@ -2064,7 +2164,8 @@ FUNCTION_TOOLS = [
  florence2_ocr,
  florence2_sam2_image,
  florence2_sam2_video_tracking,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
+ florence2_phrase_grounding_video,
  ixc25_image_vqa,
  ixc25_video_vqa,
  detr_segmentation,
@@ -1,6 +1,6 @@
  from enum import Enum
- from typing import List, Optional, Tuple, Union
  from uuid import UUID
+ from typing import List, Optional, Tuple, Union

  from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer

@@ -24,27 +24,22 @@ class PromptTask(str, Enum):
  PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


- class FineTuning(BaseModel):
+ class Florence2FtRequest(BaseModel):
  model_config = ConfigDict(populate_by_name=True)

- job_id: UUID = Field(alias="jobId")
+ image: Optional[str] = None
+ video: Optional[bytes] = None
+ task: PromptTask
+ prompt: Optional[str] = ""
+ chunk_length_frames: Optional[int] = None
  postprocessing: Optional[str] = None
+ job_id: Optional[UUID] = Field(None, alias="jobId")

  @field_serializer("job_id")
  def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
  return str(job_id)


- class Florence2FtRequest(BaseModel):
- model_config = ConfigDict(populate_by_name=True)
-
- image: str
- task: PromptTask
- tool: str
- prompt: Optional[str] = ""
- fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
-
-
  class JobStatus(str, Enum):
  """The status of a fine-tuning job.

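With `FineTuning` folded into `Florence2FtRequest`, a single model with mostly optional fields now covers both the image and the video fine-tuned requests. A hedged sketch of how the tools above build it (field names from this diff; the values are illustrative):

```python
from uuid import UUID

from vision_agent.tools.tools_types import Florence2FtRequest, PromptTask

req = Florence2FtRequest(
    task=PromptTask.PHRASE_GROUNDING,
    prompt="worker, helmet",
    image="<base64-encoded image>",  # omitted for video requests, which upload frames as files
    job_id=UUID("23b3b022-5ebf-4798-9373-20ef36429abf"),
)

# exclude_none drops the unused optional fields (e.g. video, postprocessing),
# and by_alias serializes job_id as "jobId" to match the service payload
payload = req.model_dump(by_alias=True, exclude_none=True)
```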
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.151
+ Version: 0.2.153
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
  vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
  vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
  vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
- vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=Ea_v_qLBJMVwQVLLIdNq15MgV2-6qqhcThHAHFwzv-o,18940
+ vision_agent/agent/vision_agent_prompts.py,sha256=eOqluRb1R_SJFsdWXd9HJuiJnJccEnDDUkfPXlHOjyw,11293
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,12 +14,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
- vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
- vision_agent/tools/meta_tools.py,sha256=yBlkRTeEfI3sAMZbz5mvOsHu9e1OrzDw6XLd6t-U0IY,24909
+ vision_agent/tools/__init__.py,sha256=cg4Axb9L3Z7WkdyEv5IyqDsmZKIrxmS4CmV3DEXURnU,2418
+ vision_agent/tools/meta_tools.py,sha256=yrplxiDu-L9_Dw_L2ESehJabckAq59Q-xfMpIbYB0Ak,25179
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
- vision_agent/tools/tools.py,sha256=c7SjtZD7YfxhEAGYYe-ExVCBA4NDXmRwerBIbd-XEH8,74557
- vision_agent/tools/tools_types.py,sha256=JUOZWGW2q-dlJ85CHr9gvo9KQk_rXyjJhi-iwPNn4eM,2397
+ vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
+ vision_agent/tools/tools.py,sha256=Of7NTZTc1bim_fdAoDxx47WzttGI8VlMKKcId0sMwfk,78406
+ vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
  vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4,28017
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
- vision_agent-0.2.151.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.151.dist-info/METADATA,sha256=61jba11RSszH3vWXJi2_CoqbwaXEqSTCcJWakNNFBTU,13758
- vision_agent-0.2.151.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.151.dist-info/RECORD,,
+ vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.153.dist-info/METADATA,sha256=zehWh4l1EfZeTKxSEgKXtQMb0EE5pvWP1UG0d2lyS44,13758
+ vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.153.dist-info/RECORD,,