vision-agent 0.2.227__py3-none-any.whl → 0.2.229__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -64,6 +64,9 @@ from .tools import (
64
64
  vit_image_classification,
65
65
  vit_nsfw_classification,
66
66
  custom_object_detection,
67
+ agentic_object_detection,
68
+ agentic_sam2_instance_segmentation,
69
+ agentic_sam2_video_tracking,
67
70
  )
68
71
 
69
72
  __new_tools__ = [
@@ -72,8 +72,7 @@ def send_inference_request(
72
72
 
73
73
  response = _call_post(url, payload, session, files, function_name, is_form)
74
74
 
75
- # TODO: consider making the response schema the same between below two sources
76
- return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
75
+ return response["data"]
77
76
 
78
77
 
79
78
  def send_task_inference_request(
@@ -290,6 +290,14 @@ def od_sam2_video_tracking(
290
290
  )
291
291
  function_name = "florence2_object_detection"
292
292
 
293
+ elif od_model == ODModels.AGENTIC:
294
+ segment_results = agentic_object_detection(
295
+ prompt=prompt,
296
+ image=segment_frames[frame_number],
297
+ fine_tune_id=fine_tune_id,
298
+ )
299
+ function_name = "agentic_object_detection"
300
+
293
301
  elif od_model == ODModels.CUSTOM:
294
302
  segment_results = custom_object_detection(
295
303
  deployment_id=fine_tune_id,
@@ -2140,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
2140
2148
  return response
2141
2149
 
2142
2150
 
2151
+ # agentic od tools
2152
+
2153
+
2154
+ def _agentic_object_detection(
2155
+ prompt: str,
2156
+ image: np.ndarray,
2157
+ image_size: Tuple[int, ...],
2158
+ image_bytes: Optional[bytes] = None,
2159
+ fine_tune_id: Optional[str] = None,
2160
+ ) -> Dict[str, Any]:
2161
+ if image_bytes is None:
2162
+ image_bytes = numpy_to_bytes(image)
2163
+
2164
+ files = [("image", image_bytes)]
2165
+ payload = {
2166
+ "prompts": [s.strip() for s in prompt.split(",")],
2167
+ "model": "agentic",
2168
+ }
2169
+ metadata = {"function_name": "agentic_object_detection"}
2170
+
2171
+ if fine_tune_id is not None:
2172
+ landing_api = LandingPublicAPI()
2173
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
2174
+ if status is not JobStatus.SUCCEEDED:
2175
+ raise FineTuneModelIsNotReady(
2176
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
2177
+ )
2178
+
2179
+ # we can only execute fine-tuned models with florence2
2180
+ payload = {
2181
+ "prompts": payload["prompts"],
2182
+ "jobId": fine_tune_id,
2183
+ "model": "florence2",
2184
+ }
2185
+
2186
+ detections = send_task_inference_request(
2187
+ payload,
2188
+ "text-to-object-detection",
2189
+ files=files,
2190
+ metadata=metadata,
2191
+ )
2192
+
2193
+ # get the first frame
2194
+ bboxes = detections[0]
2195
+ bboxes_formatted = [
2196
+ {
2197
+ "label": bbox["label"],
2198
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
2199
+ "score": bbox["score"],
2200
+ }
2201
+ for bbox in bboxes
2202
+ ]
2203
+ display_data = [
2204
+ {
2205
+ "label": bbox["label"],
2206
+ "bbox": bbox["bounding_box"],
2207
+ "score": bbox["score"],
2208
+ }
2209
+ for bbox in bboxes
2210
+ ]
2211
+ return {
2212
+ "files": files,
2213
+ "return_data": bboxes_formatted,
2214
+ "display_data": display_data,
2215
+ }
2216
+
2217
+
2218
+ def agentic_object_detection(
2219
+ prompt: str,
2220
+ image: np.ndarray,
2221
+ fine_tune_id: Optional[str] = None,
2222
+ ) -> List[Dict[str, Any]]:
2223
+ """'agentic_object_detection' is a tool that can detect and count multiple objects
2224
+ given a text prompt such as category names or referring expressions on images. The
2225
+ categories in text prompt are separated by commas. It returns a list of bounding
2226
+ boxes with normalized coordinates, label names and associated probability scores.
2227
+
2228
+ Parameters:
2229
+ prompt (str): The prompt to ground to the image.
2230
+ image (np.ndarray): The image to ground the prompt to.
2231
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
2232
+ fine-tuned model ID here to use it.
2233
+
2234
+ Returns:
2235
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
2236
+ bounding box of the detected objects with normalized coordinates between 0
2237
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
2238
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
2239
+ bounding box.
2240
+
2241
+ Example
2242
+ -------
2243
+ >>> agentic_object_detection("car", image)
2244
+ [
2245
+ {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
2246
+ {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
2247
+ ]
2248
+ """
2249
+
2250
+ image_size = image.shape[:2]
2251
+ if image_size[0] < 1 or image_size[1] < 1:
2252
+ return []
2253
+
2254
+ ret = _agentic_object_detection(
2255
+ prompt, image, image_size, fine_tune_id=fine_tune_id
2256
+ )
2257
+
2258
+ _display_tool_trace(
2259
+ agentic_object_detection.__name__,
2260
+ {"prompts": prompt},
2261
+ ret["display_data"],
2262
+ ret["files"],
2263
+ )
2264
+ return ret["return_data"] # type: ignore
2265
+
2266
+
2267
+ def agentic_sam2_instance_segmentation(
2268
+ prompt: str, image: np.ndarray
2269
+ ) -> List[Dict[str, Any]]:
2270
+ """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
2271
+ instances of objects given a text prompt such as category names or referring
2272
+ expressions on images. The categories in text prompt are separated by commas. It
2273
+ returns a list of bounding boxes with normalized coordinates, label names, masks
2274
+ and associated probability scores.
2275
+
2276
+ Parameters:
2277
+ prompt (str): The object that needs to be counted.
2278
+ image (np.ndarray): The image that contains multiple instances of the object.
2279
+
2280
+ Returns:
2281
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2282
+ bounding box, and mask of the detected objects with normalized coordinates
2283
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2284
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2285
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2286
+ the background.
2287
+
2288
+ Example
2289
+ -------
2290
+ >>> agentic_sam2_instance_segmentation("flower", image)
2291
+ [
2292
+ {
2293
+ 'score': 0.49,
2294
+ 'label': 'flower',
2295
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
2296
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2297
+ [0, 0, 0, ..., 0, 0, 0],
2298
+ ...,
2299
+ [0, 0, 0, ..., 0, 0, 0],
2300
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2301
+ },
2302
+ ]
2303
+ """
2304
+
2305
+ od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
2306
+ seg_ret = _sam2(
2307
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
2308
+ )
2309
+
2310
+ _display_tool_trace(
2311
+ agentic_sam2_instance_segmentation.__name__,
2312
+ {
2313
+ "prompts": prompt,
2314
+ },
2315
+ seg_ret["display_data"],
2316
+ seg_ret["files"],
2317
+ )
2318
+
2319
+ return seg_ret["return_data"] # type: ignore
2320
+
2321
+
2322
+ def agentic_sam2_video_tracking(
2323
+ prompt: str,
2324
+ frames: List[np.ndarray],
2325
+ chunk_length: Optional[int] = 10,
2326
+ fine_tune_id: Optional[str] = None,
2327
+ ) -> List[List[Dict[str, Any]]]:
2328
+ """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
2329
+ objects in a video given a text prompt such as category names or referring
2330
+ expressions. The categories in the text prompt are separated by commas. It returns
2331
+ a list of bounding boxes, label names, masks and associated probability scores and
2332
+ is useful for tracking and counting without duplicating counts.
2333
+
2334
+ Parameters:
2335
+ prompt (str): The prompt to ground to the image.
2336
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
2337
+ chunk_length (Optional[int]): The number of frames after which agentic object
2338
+ detection is re-run to find new objects.
2339
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
2340
+ fine-tuned model ID here to use it.
2341
+
2342
+ Returns:
2343
+ List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
2344
+ label, segmentation mask and bounding boxes. The outer list represents each
2345
+ frame and the inner list is the entities per frame. The detected objects
2346
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
2347
+ and ymin are the coordinates of the top-left and xmax and ymax are the
2348
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
2349
+ numpy array where 1 indicates the object and 0 indicates the background.
2350
+ The label names are prefixed with their ID, which represents the total count.
2351
+
2352
+ Example
2353
+ -------
2354
+ >>> agentic_sam2_video_tracking("dinosaur", frames)
2355
+ [
2356
+ [
2357
+ {
2358
+ 'label': '0: dinosaur',
2359
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
2360
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2361
+ [0, 0, 0, ..., 0, 0, 0],
2362
+ ...,
2363
+ [0, 0, 0, ..., 0, 0, 0],
2364
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2365
+ },
2366
+ ],
2367
+ ...
2368
+ ]
2369
+ """
2370
+
2371
+ ret = od_sam2_video_tracking(
2372
+ ODModels.AGENTIC,
2373
+ prompt=prompt,
2374
+ frames=frames,
2375
+ chunk_length=chunk_length,
2376
+ fine_tune_id=fine_tune_id,
2377
+ )
2378
+ _display_tool_trace(
2379
+ agentic_sam2_video_tracking.__name__,
2380
+ {},
2381
+ ret["display_data"],
2382
+ ret["files"],
2383
+ )
2384
+ return ret["return_data"] # type: ignore
2385
+
2386
+
2143
2387
  def minimum_distance(
2144
2388
  det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
2145
2389
  ) -> float:
@@ -17,6 +17,7 @@ class ODModels(str, Enum):
17
17
  COUNTGD = "countgd"
18
18
  FLORENCE2 = "florence2"
19
19
  OWLV2 = "owlv2"
20
+ AGENTIC = "agentic"
20
21
  CUSTOM = "custom"
21
22
 
22
23
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.227
3
+ Version: 0.2.229
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
29
+ vision_agent/tools/__init__.py,sha256=8VpAC8zEk8OwcMLcTn7gEAfw6ihqlsEfzjEaW5yd5-4,2897
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
- vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
34
- vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
33
+ vision_agent/tools/tool_utils.py,sha256=kXB0F-HwmiChpQgKk7tMo-Acsl3UXxjaJV9mYo_q6n4,10076
34
+ vision_agent/tools/tools.py,sha256=M_kk17Yr5c6ODKet26GcxZAlGDwl0AwMMD4wCrBhR6Y,105157
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
40
40
  vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
44
- vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
- vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
46
- vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
- vision_agent-0.2.227.dist-info/RECORD,,
43
+ vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
44
+ vision_agent-0.2.229.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
+ vision_agent-0.2.229.dist-info/METADATA,sha256=ver5sB_NI_dkek1GxY9GsvktACS1Rl6-tgrr_B5p1Zc,20039
46
+ vision_agent-0.2.229.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
+ vision_agent-0.2.229.dist-info/RECORD,,