vision-agent 0.2.224__py3-none-any.whl → 0.2.225__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,7 +6,6 @@ import tempfile
6
6
  import urllib.request
7
7
  from base64 import b64encode
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from enum import Enum
10
9
  from importlib import resources
11
10
  from pathlib import Path
12
11
  from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -54,6 +53,13 @@ from vision_agent.utils.video import (
54
53
  frames_to_bytes,
55
54
  video_writer,
56
55
  )
56
+ from vision_agent.utils.video_tracking import (
57
+ ODModels,
58
+ merge_segments,
59
+ post_process,
60
+ process_segment,
61
+ split_frames_into_segments,
62
+ )
57
63
 
58
64
  register_heif_opener()
59
65
 
@@ -224,12 +230,6 @@ def sam2(
224
230
  return ret["return_data"] # type: ignore
225
231
 
226
232
 
227
- class ODModels(str, Enum):
228
- COUNTGD = "countgd"
229
- FLORENCE2 = "florence2"
230
- OWLV2 = "owlv2"
231
-
232
-
233
233
  def od_sam2_video_tracking(
234
234
  od_model: ODModels,
235
235
  prompt: str,
@@ -237,105 +237,92 @@ def od_sam2_video_tracking(
237
237
  chunk_length: Optional[int] = 10,
238
238
  fine_tune_id: Optional[str] = None,
239
239
  ) -> Dict[str, Any]:
240
- results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
240
+ SEGMENT_SIZE = 50
241
+ OVERLAP = 1 # Number of overlapping frames between segments
241
242
 
242
- if chunk_length is None:
243
- step = 1 # Process every frame
244
- elif chunk_length <= 0:
245
- raise ValueError("chunk_length must be a positive integer or None.")
246
- else:
247
- step = chunk_length # Process frames with the specified step size
243
+ image_size = frames[0].shape[:2]
244
+
245
+ # Split frames into segments with overlap
246
+ segments = split_frames_into_segments(frames, SEGMENT_SIZE, OVERLAP)
247
+
248
+ def _apply_object_detection( # inner method to avoid circular importing issues.
249
+ od_model: ODModels,
250
+ prompt: str,
251
+ segment_index: int,
252
+ frame_number: int,
253
+ fine_tune_id: str,
254
+ segment_frames: list,
255
+ ) -> tuple:
256
+ """
257
+ Applies the specified object detection model to the given image.
258
+
259
+ Args:
260
+ od_model: The object detection model to use.
261
+ prompt: The prompt for the object detection model.
262
+ segment_index: The index of the current segment.
263
+ frame_number: The number of the current frame.
264
+ fine_tune_id: Optional fine-tune ID for the model.
265
+ segment_frames: List of frames for the current segment.
266
+
267
+ Returns:
268
+ A tuple containing the object detection results and the name of the function used.
269
+ """
248
270
 
249
- for idx in range(0, len(frames), step):
250
271
  if od_model == ODModels.COUNTGD:
251
- results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
272
+ segment_results = countgd_object_detection(
273
+ prompt=prompt, image=segment_frames[frame_number]
274
+ )
252
275
  function_name = "countgd_object_detection"
276
+
253
277
  elif od_model == ODModels.OWLV2:
254
- results[idx] = owlv2_object_detection(
255
- prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
278
+ segment_results = owlv2_object_detection(
279
+ prompt=prompt,
280
+ image=segment_frames[frame_number],
281
+ fine_tune_id=fine_tune_id,
256
282
  )
257
283
  function_name = "owlv2_object_detection"
284
+
258
285
  elif od_model == ODModels.FLORENCE2:
259
- results[idx] = florence2_object_detection(
260
- prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
286
+ segment_results = florence2_object_detection(
287
+ prompt=prompt,
288
+ image=segment_frames[frame_number],
289
+ fine_tune_id=fine_tune_id,
261
290
  )
262
291
  function_name = "florence2_object_detection"
292
+
263
293
  else:
264
294
  raise NotImplementedError(
265
295
  f"Object detection model '{od_model}' is not implemented."
266
296
  )
267
297
 
268
- image_size = frames[0].shape[:2]
269
-
270
- def _transform_detections(
271
- input_list: List[Optional[List[Dict[str, Any]]]],
272
- ) -> List[Optional[Dict[str, Any]]]:
273
- output_list: List[Optional[Dict[str, Any]]] = []
274
-
275
- for _, frame in enumerate(input_list):
276
- if frame is not None:
277
- labels = [detection["label"] for detection in frame]
278
- bboxes = [
279
- denormalize_bbox(detection["bbox"], image_size)
280
- for detection in frame
281
- ]
282
-
283
- output_list.append(
284
- {
285
- "labels": labels,
286
- "bboxes": bboxes,
287
- }
288
- )
289
- else:
290
- output_list.append(None)
291
-
292
- return output_list
298
+ return segment_results, function_name
299
+
300
+ # Process each segment and collect detections
301
+ detections_per_segment: List[Any] = []
302
+ for segment_index, segment in enumerate(segments):
303
+ segment_detections = process_segment(
304
+ segment_frames=segment,
305
+ od_model=od_model,
306
+ prompt=prompt,
307
+ fine_tune_id=fine_tune_id,
308
+ chunk_length=chunk_length,
309
+ image_size=image_size,
310
+ segment_index=segment_index,
311
+ object_detection_tool=_apply_object_detection,
312
+ )
313
+ detections_per_segment.append(segment_detections)
293
314
 
294
- output = _transform_detections(results)
315
+ merged_detections = merge_segments(detections_per_segment)
316
+ post_processed = post_process(merged_detections, image_size)
295
317
 
296
318
  buffer_bytes = frames_to_bytes(frames)
297
319
  files = [("video", buffer_bytes)]
298
- payload = {"bboxes": json.dumps(output), "chunk_length_frames": chunk_length}
299
- metadata = {"function_name": function_name}
300
-
301
- detections = send_task_inference_request(
302
- payload,
303
- "sam2",
304
- files=files,
305
- metadata=metadata,
306
- )
307
320
 
308
- return_data = []
309
- for frame in detections:
310
- return_frame_data = []
311
- for detection in frame:
312
- mask = rle_decode_array(detection["mask"])
313
- label = str(detection["id"]) + ": " + detection["label"]
314
- return_frame_data.append(
315
- {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
316
- )
317
- return_data.append(return_frame_data)
318
- return_data = add_bboxes_from_masks(return_data)
319
- return_data = nms(return_data, iou_threshold=0.95)
320
-
321
- # We save the RLE for display purposes, re-calculting RLE can get very expensive.
322
- # Deleted here because we are returning the numpy masks instead
323
- display_data = []
324
- for frame in return_data:
325
- display_frame_data = []
326
- for obj in frame:
327
- display_frame_data.append(
328
- {
329
- "label": obj["label"],
330
- "score": obj["score"],
331
- "bbox": denormalize_bbox(obj["bbox"], image_size),
332
- "mask": obj["rle"],
333
- }
334
- )
335
- del obj["rle"]
336
- display_data.append(display_frame_data)
337
-
338
- return {"files": files, "return_data": return_data, "display_data": detections}
321
+ return {
322
+ "files": files,
323
+ "return_data": post_processed["return_data"],
324
+ "display_data": post_processed["display_data"],
325
+ }
339
326
 
340
327
 
341
328
  # Owl V2 Tools
@@ -0,0 +1,305 @@
1
+ import json
2
+ from enum import Enum
3
+ from typing import Any, Callable, Dict, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+
7
+ from vision_agent.tools.tool_utils import (
8
+ add_bboxes_from_masks,
9
+ nms,
10
+ send_task_inference_request,
11
+ )
12
+ from vision_agent.utils.image_utils import denormalize_bbox, rle_decode_array
13
+ from vision_agent.utils.video import frames_to_bytes
14
+
15
+
16
+ class ODModels(str, Enum):
17
+ COUNTGD = "countgd"
18
+ FLORENCE2 = "florence2"
19
+ OWLV2 = "owlv2"
20
+
21
+
22
+ def split_frames_into_segments(
23
+ frames: List[np.ndarray], segment_size: int = 50, overlap: int = 1
24
+ ) -> List[List[np.ndarray]]:
25
+ """
26
+ Splits the list of frames into segments with a specified size and overlap.
27
+
28
+ Args:
29
+ frames (List[np.ndarray]): List of video frames.
30
+ segment_size (int, optional): Number of frames per segment. Defaults to 50.
31
+ overlap (int, optional): Number of overlapping frames between segments. Defaults to 1.
32
+
33
+ Returns:
34
+ List[List[np.ndarray]]: List of frame segments.
35
+ """
36
+ segments = []
37
+ start = 0
38
+ segment_count = 0
39
+ while start < len(frames):
40
+ end = start + segment_size
41
+ if end > len(frames):
42
+ end = len(frames)
43
+ if start != 0:
44
+ # Include the last frame of the previous segment
45
+ segment = frames[start - overlap : end]
46
+ else:
47
+ segment = frames[start:end]
48
+ segments.append(segment)
49
+ start += segment_size
50
+ segment_count += 1
51
+ return segments
52
+
53
+
54
+ def process_segment(
55
+ segment_frames: List[np.ndarray],
56
+ od_model: ODModels,
57
+ prompt: str,
58
+ fine_tune_id: Optional[str],
59
+ chunk_length: Optional[int],
60
+ image_size: Tuple[int, ...],
61
+ segment_index: int,
62
+ object_detection_tool: Callable,
63
+ ) -> Any:
64
+ """
65
+ Processes a segment of frames with the specified object detection model.
66
+
67
+ Args:
68
+ segment_frames (List[np.ndarray]): Frames in the segment.
69
+ od_model (ODModels): Object detection model to use.
70
+ prompt (str): Prompt for the model.
71
+ fine_tune_id (Optional[str]): Fine-tune model ID.
72
+ chunk_length (Optional[int]): Chunk length for processing.
73
+ image_size (Tuple[int, int]): Size of the images.
74
+ segment_index (int): Index of the segment.
75
+ object_detection_tool (Callable): Object detection tool to use.
76
+
77
+ Returns:
78
+ Any: Detections for the segment.
79
+ """
80
+ segment_results: List[Optional[List[Dict[str, Any]]]] = [None] * len(segment_frames)
81
+
82
+ if chunk_length is None:
83
+ step = 1
84
+ elif chunk_length <= 0:
85
+ raise ValueError("chunk_length must be a positive integer or None.")
86
+ else:
87
+ step = chunk_length
88
+
89
+ function_name = ""
90
+
91
+ for idx in range(0, len(segment_frames), step):
92
+ frame_number = idx
93
+ segment_results[idx], function_name = object_detection_tool(
94
+ od_model, prompt, segment_index, frame_number, fine_tune_id, segment_frames
95
+ )
96
+
97
+ transformed_detections = transform_detections(
98
+ segment_results, image_size, segment_index
99
+ )
100
+
101
+ buffer_bytes = frames_to_bytes(segment_frames)
102
+ files = [("video", buffer_bytes)]
103
+ payload = {
104
+ "bboxes": json.dumps(transformed_detections),
105
+ "chunk_length_frames": chunk_length,
106
+ }
107
+ metadata = {"function_name": function_name}
108
+
109
+ segment_detections = send_task_inference_request(
110
+ payload,
111
+ "sam2",
112
+ files=files,
113
+ metadata=metadata,
114
+ )
115
+
116
+ return segment_detections
117
+
118
+
119
+ def transform_detections(
120
+ input_list: List[Optional[List[Dict[str, Any]]]],
121
+ image_size: Tuple[int, ...],
122
+ segment_index: int,
123
+ ) -> List[Optional[Dict[str, Any]]]:
124
+ """
125
+ Transforms raw detections into a standardized format.
126
+
127
+ Args:
128
+ input_list (List[Optional[List[Dict[str, Any]]]]): Raw detections.
129
+ image_size (Tuple[int, int]): Size of the images.
130
+ segment_index (int): Index of the segment.
131
+
132
+ Returns:
133
+ List[Optional[Dict[str, Any]]]: Transformed detections.
134
+ """
135
+ output_list: List[Optional[Dict[str, Any]]] = []
136
+ for frame_idx, frame in enumerate(input_list):
137
+ if frame is not None:
138
+ labels = [detection["label"] for detection in frame]
139
+ bboxes = [
140
+ denormalize_bbox(detection["bbox"], image_size) for detection in frame
141
+ ]
142
+
143
+ output_list.append(
144
+ {
145
+ "labels": labels,
146
+ "bboxes": bboxes,
147
+ }
148
+ )
149
+ else:
150
+ output_list.append(None)
151
+ return output_list
152
+
153
+
154
+ def _calculate_mask_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
155
+ mask1 = mask1.astype(bool)
156
+ mask2 = mask2.astype(bool)
157
+
158
+ intersection = np.sum(np.logical_and(mask1, mask2))
159
+ union = np.sum(np.logical_or(mask1, mask2))
160
+
161
+ if union == 0:
162
+ iou = 0.0
163
+ else:
164
+ iou = intersection / union
165
+
166
+ return iou
167
+
168
+
169
+ def _match_by_iou(
170
+ first_param: List[Dict],
171
+ second_param: List[Dict],
172
+ iou_threshold: float = 0.8,
173
+ ) -> Tuple[List[Dict], Dict[int, int]]:
174
+ max_id = max((item["id"] for item in first_param), default=0)
175
+
176
+ matched_new_item_indices = set()
177
+ id_mapping = {}
178
+
179
+ for new_index, new_item in enumerate(second_param):
180
+ matched_id = None
181
+
182
+ for existing_item in first_param:
183
+ iou = _calculate_mask_iou(
184
+ existing_item["decoded_mask"], new_item["decoded_mask"]
185
+ )
186
+ if iou > iou_threshold:
187
+ matched_id = existing_item["id"]
188
+ matched_new_item_indices.add(new_index)
189
+ id_mapping[new_item["id"]] = matched_id
190
+ break
191
+
192
+ if matched_id:
193
+ new_item["id"] = matched_id
194
+ else:
195
+ max_id += 1
196
+ id_mapping[new_item["id"]] = max_id
197
+ new_item["id"] = max_id
198
+
199
+ unmatched_items = [
200
+ item for i, item in enumerate(second_param) if i not in matched_new_item_indices
201
+ ]
202
+ combined_list = first_param + unmatched_items
203
+
204
+ return combined_list, id_mapping
205
+
206
+
207
+ def _update_ids(detections: List[Dict], id_mapping: Dict[int, int]) -> None:
208
+ for inner_list in detections:
209
+ for detection in inner_list:
210
+ if detection["id"] in id_mapping:
211
+ detection["id"] = id_mapping[detection["id"]]
212
+ else:
213
+ max_new_id = max(id_mapping.values(), default=0)
214
+ detection["id"] = max_new_id + 1
215
+ id_mapping[detection["id"]] = detection["id"]
216
+
217
+
218
+ def _convert_to_2d(detections_per_segment: List[Any]) -> List[Any]:
219
+ result = []
220
+ for i, segment in enumerate(detections_per_segment):
221
+ if i == 0:
222
+ result.extend(segment)
223
+ else:
224
+ result.extend(segment[1:])
225
+ return result
226
+
227
+
228
+ def merge_segments(detections_per_segment: List[Any]) -> List[Any]:
229
+ """
230
+ Merges detections from all segments into a unified result.
231
+
232
+ Args:
233
+ detections_per_segment (List[Any]): List of detections per segment.
234
+
235
+ Returns:
236
+ List[Any]: Merged detections.
237
+ """
238
+ for segment in detections_per_segment:
239
+ for detection in segment:
240
+ for item in detection:
241
+ item["decoded_mask"] = rle_decode_array(item["mask"])
242
+
243
+ for segment_idx in range(len(detections_per_segment) - 1):
244
+ combined_detection, id_mapping = _match_by_iou(
245
+ detections_per_segment[segment_idx][-1],
246
+ detections_per_segment[segment_idx + 1][0],
247
+ )
248
+ _update_ids(detections_per_segment[segment_idx + 1], id_mapping)
249
+
250
+ merged_result = _convert_to_2d(detections_per_segment)
251
+
252
+ return merged_result
253
+
254
+
255
+ def post_process(
256
+ merged_detections: List[Any],
257
+ image_size: Tuple[int, ...],
258
+ ) -> Dict[str, Any]:
259
+ """
260
+ Performs post-processing on merged detections, including NMS and preparing display data.
261
+
262
+ Args:
263
+ merged_detections (List[Any]): Merged detections from all segments.
264
+ image_size (Tuple[int, int]): Size of the images.
265
+
266
+ Returns:
267
+ Dict[str, Any]: Post-processed data including return_data and display_data.
268
+ """
269
+ return_data = []
270
+ for frame_idx, frame in enumerate(merged_detections):
271
+ return_frame_data = []
272
+ for detection in frame:
273
+ label = f"{detection['id']}: {detection['label']}"
274
+ return_frame_data.append(
275
+ {
276
+ "label": label,
277
+ "mask": detection["decoded_mask"],
278
+ "rle": detection["mask"],
279
+ "score": 1.0,
280
+ }
281
+ )
282
+ del detection["decoded_mask"]
283
+ return_data.append(return_frame_data)
284
+
285
+ return_data = add_bboxes_from_masks(return_data)
286
+ return_data = nms(return_data, iou_threshold=0.95)
287
+
288
+ # We save the RLE for display purposes, re-calculting RLE can get very expensive.
289
+ # Deleted here because we are returning the numpy masks instead
290
+ display_data = []
291
+ for frame in return_data:
292
+ display_frame_data = []
293
+ for obj in frame:
294
+ display_frame_data.append(
295
+ {
296
+ "label": obj["label"],
297
+ "bbox": denormalize_bbox(obj["bbox"], image_size),
298
+ "mask": obj["rle"],
299
+ "score": obj["score"],
300
+ }
301
+ )
302
+ del obj["rle"]
303
+ display_data.append(display_frame_data)
304
+
305
+ return {"return_data": return_data, "display_data": display_data}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.224
3
+ Version: 0.2.225
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -31,7 +31,7 @@ vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj
31
31
  vision_agent/tools/planner_tools.py,sha256=CvaJ2vGM8O_CYvsoSk1avxAMqpIu3tv4C2bY0p1X-X4,13519
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
34
- vision_agent/tools/tools.py,sha256=60S5ItFG9yKzVb8FU8oLFj_aouDg2-4vlieDbSgfPdQ,91306
34
+ vision_agent/tools/tools.py,sha256=cQYO1TfWhm9C_KaU201aTYec-w0m9QoQMzqjxWvQWGU,90770
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
40
40
  vision_agent/utils/sim.py,sha256=znsInUDrsyBi3OlgAlV3rDn5UQQRfJAWXTXm7D7eJA8,9125
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent-0.2.224.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.224.dist-info/METADATA,sha256=wT49_byW9-Oz6-1eSlP3cW_AFGbWaxtKrYsGB4nT62o,20039
45
- vision_agent-0.2.224.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.224.dist-info/RECORD,,
43
+ vision_agent/utils/video_tracking.py,sha256=EeOiSY8gjvvneuAnv-BO7yOyMBF_-1Irk_lLLOt3bDM,9452
44
+ vision_agent-0.2.225.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
+ vision_agent-0.2.225.dist-info/METADATA,sha256=PzehPaQUIj_3TImCmj1YEFMI1rPkrd6FqcIlXhmWjLE,20039
46
+ vision_agent-0.2.225.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
+ vision_agent-0.2.225.dist-info/RECORD,,