vision-agent 0.2.227__py3-none-any.whl → 0.2.229__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,6 +64,9 @@ from .tools import (
64
64
  vit_image_classification,
65
65
  vit_nsfw_classification,
66
66
  custom_object_detection,
67
+ agentic_object_detection,
68
+ agentic_sam2_instance_segmentation,
69
+ agentic_sam2_video_tracking,
67
70
  )
68
71
 
69
72
  __new_tools__ = [
@@ -72,8 +72,7 @@ def send_inference_request(
72
72
 
73
73
  response = _call_post(url, payload, session, files, function_name, is_form)
74
74
 
75
- # TODO: consider making the response schema the same between below two sources
76
- return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
75
+ return response["data"]
77
76
 
78
77
 
79
78
  def send_task_inference_request(
@@ -290,6 +290,14 @@ def od_sam2_video_tracking(
290
290
  )
291
291
  function_name = "florence2_object_detection"
292
292
 
293
+ elif od_model == ODModels.AGENTIC:
294
+ segment_results = agentic_object_detection(
295
+ prompt=prompt,
296
+ image=segment_frames[frame_number],
297
+ fine_tune_id=fine_tune_id,
298
+ )
299
+ function_name = "agentic_object_detection"
300
+
293
301
  elif od_model == ODModels.CUSTOM:
294
302
  segment_results = custom_object_detection(
295
303
  deployment_id=fine_tune_id,
@@ -2140,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
2140
2148
  return response
2141
2149
 
2142
2150
 
2151
+ # agentic od tools
2152
+
2153
+
2154
+ def _agentic_object_detection(
2155
+ prompt: str,
2156
+ image: np.ndarray,
2157
+ image_size: Tuple[int, ...],
2158
+ image_bytes: Optional[bytes] = None,
2159
+ fine_tune_id: Optional[str] = None,
2160
+ ) -> Dict[str, Any]:
2161
+ if image_bytes is None:
2162
+ image_bytes = numpy_to_bytes(image)
2163
+
2164
+ files = [("image", image_bytes)]
2165
+ payload = {
2166
+ "prompts": [s.strip() for s in prompt.split(",")],
2167
+ "model": "agentic",
2168
+ }
2169
+ metadata = {"function_name": "agentic_object_detection"}
2170
+
2171
+ if fine_tune_id is not None:
2172
+ landing_api = LandingPublicAPI()
2173
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
2174
+ if status is not JobStatus.SUCCEEDED:
2175
+ raise FineTuneModelIsNotReady(
2176
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
2177
+ )
2178
+
2179
+ # we can only execute fine-tuned models with florence2
2180
+ payload = {
2181
+ "prompts": payload["prompts"],
2182
+ "jobId": fine_tune_id,
2183
+ "model": "florence2",
2184
+ }
2185
+
2186
+ detections = send_task_inference_request(
2187
+ payload,
2188
+ "text-to-object-detection",
2189
+ files=files,
2190
+ metadata=metadata,
2191
+ )
2192
+
2193
+ # get the first frame
2194
+ bboxes = detections[0]
2195
+ bboxes_formatted = [
2196
+ {
2197
+ "label": bbox["label"],
2198
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
2199
+ "score": bbox["score"],
2200
+ }
2201
+ for bbox in bboxes
2202
+ ]
2203
+ display_data = [
2204
+ {
2205
+ "label": bbox["label"],
2206
+ "bbox": bbox["bounding_box"],
2207
+ "score": bbox["score"],
2208
+ }
2209
+ for bbox in bboxes
2210
+ ]
2211
+ return {
2212
+ "files": files,
2213
+ "return_data": bboxes_formatted,
2214
+ "display_data": display_data,
2215
+ }
2216
+
2217
+
2218
+ def agentic_object_detection(
2219
+ prompt: str,
2220
+ image: np.ndarray,
2221
+ fine_tune_id: Optional[str] = None,
2222
+ ) -> List[Dict[str, Any]]:
2223
+ """'agentic_object_detection' is a tool that can detect and count multiple objects
2224
+ given a text prompt such as category names or referring expressions on images. The
2225
+ categories in text prompt are separated by commas. It returns a list of bounding
2226
+ boxes with normalized coordinates, label names and associated probability scores.
2227
+
2228
+ Parameters:
2229
+ prompt (str): The prompt to ground to the image.
2230
+ image (np.ndarray): The image to ground the prompt to.
2231
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
2232
+ fine-tuned model ID here to use it.
2233
+
2234
+ Returns:
2235
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
2236
+ bounding box of the detected objects with normalized coordinates between 0
2237
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
2238
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
2239
+ bounding box.
2240
+
2241
+ Example
2242
+ -------
2243
+ >>> agentic_object_detection("car", image)
2244
+ [
2245
+ {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
2246
+ {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
2247
+ ]
2248
+ """
2249
+
2250
+ image_size = image.shape[:2]
2251
+ if image_size[0] < 1 or image_size[1] < 1:
2252
+ return []
2253
+
2254
+ ret = _agentic_object_detection(
2255
+ prompt, image, image_size, fine_tune_id=fine_tune_id
2256
+ )
2257
+
2258
+ _display_tool_trace(
2259
+ agentic_object_detection.__name__,
2260
+ {"prompts": prompt},
2261
+ ret["display_data"],
2262
+ ret["files"],
2263
+ )
2264
+ return ret["return_data"] # type: ignore
2265
+
2266
+
2267
+ def agentic_sam2_instance_segmentation(
2268
+ prompt: str, image: np.ndarray
2269
+ ) -> List[Dict[str, Any]]:
2270
+ """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
2271
+ instances of objects given a text prompt such as category names or referring
2272
+ expressions on images. The categories in text prompt are separated by commas. It
2273
+ returns a list of bounding boxes with normalized coordinates, label names, masks
2274
+ and associated probability scores.
2275
+
2276
+ Parameters:
2277
+ prompt (str): The object that needs to be counted.
2278
+ image (np.ndarray): The image that contains multiple instances of the object.
2279
+
2280
+ Returns:
2281
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2282
+ bounding box, and mask of the detected objects with normalized coordinates
2283
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2284
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2285
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2286
+ the background.
2287
+
2288
+ Example
2289
+ -------
2290
+ >>> agentic_sam2_instance_segmentation("flower", image)
2291
+ [
2292
+ {
2293
+ 'score': 0.49,
2294
+ 'label': 'flower',
2295
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
2296
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2297
+ [0, 0, 0, ..., 0, 0, 0],
2298
+ ...,
2299
+ [0, 0, 0, ..., 0, 0, 0],
2300
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2301
+ },
2302
+ ]
2303
+ """
2304
+
2305
+ od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
2306
+ seg_ret = _sam2(
2307
+ image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
2308
+ )
2309
+
2310
+ _display_tool_trace(
2311
+ agentic_sam2_instance_segmentation.__name__,
2312
+ {
2313
+ "prompts": prompt,
2314
+ },
2315
+ seg_ret["display_data"],
2316
+ seg_ret["files"],
2317
+ )
2318
+
2319
+ return seg_ret["return_data"] # type: ignore
2320
+
2321
+
2322
+ def agentic_sam2_video_tracking(
2323
+ prompt: str,
2324
+ frames: List[np.ndarray],
2325
+ chunk_length: Optional[int] = 10,
2326
+ fine_tune_id: Optional[str] = None,
2327
+ ) -> List[List[Dict[str, Any]]]:
2328
+ """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
2329
+ objects in a video given a text prompt such as category names or referring
2330
+ expressions. The categories in the text prompt are separated by commas. It returns
2331
+ a list of bounding boxes, label names, masks and associated probability scores and
2332
+ is useful for tracking and counting without duplicating counts.
2333
+
2334
+ Parameters:
2335
+ prompt (str): The prompt to ground to the image.
2336
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
2337
+ chunk_length (Optional[int]): The number of frames after which to re-run agentic object detection
2338
+ to find new objects.
2339
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
2340
+ fine-tuned model ID here to use it.
2341
+
2342
+ Returns:
2343
+ List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
2344
+ label, segmentation mask and bounding boxes. The outer list represents each
2345
+ frame and the inner list is the entities per frame. The detected objects
2346
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
2347
+ and ymin are the coordinates of the top-left and xmax and ymax are the
2348
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
2349
+ numpy array where 1 indicates the object and 0 indicates the background.
2350
+ The label names are prefixed with their ID to represent the total count.
2351
+
2352
+ Example
2353
+ -------
2354
+ >>> agentic_sam2_video_tracking("dinosaur", frames)
2355
+ [
2356
+ [
2357
+ {
2358
+ 'label': '0: dinosaur',
2359
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
2360
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2361
+ [0, 0, 0, ..., 0, 0, 0],
2362
+ ...,
2363
+ [0, 0, 0, ..., 0, 0, 0],
2364
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2365
+ },
2366
+ ],
2367
+ ...
2368
+ ]
2369
+ """
2370
+
2371
+ ret = od_sam2_video_tracking(
2372
+ ODModels.AGENTIC,
2373
+ prompt=prompt,
2374
+ frames=frames,
2375
+ chunk_length=chunk_length,
2376
+ fine_tune_id=fine_tune_id,
2377
+ )
2378
+ _display_tool_trace(
2379
+ agentic_sam2_video_tracking.__name__,
2380
+ {},
2381
+ ret["display_data"],
2382
+ ret["files"],
2383
+ )
2384
+ return ret["return_data"] # type: ignore
2385
+
2386
+
2143
2387
  def minimum_distance(
2144
2388
  det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
2145
2389
  ) -> float:
@@ -17,6 +17,7 @@ class ODModels(str, Enum):
17
17
  COUNTGD = "countgd"
18
18
  FLORENCE2 = "florence2"
19
19
  OWLV2 = "owlv2"
20
+ AGENTIC = "agentic"
20
21
  CUSTOM = "custom"
21
22
 
22
23
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.227
3
+ Version: 0.2.229
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=fcucnAzr5Hue9xSqpBgA7RcRJP2CgAgQJ31p_R5lg-I,2794
29
+ vision_agent/tools/__init__.py,sha256=8VpAC8zEk8OwcMLcTn7gEAfw6ihqlsEfzjEaW5yd5-4,2897
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
31
  vision_agent/tools/planner_tools.py,sha256=qQvPuCif-KbFi7KsXKkTCfpgEQEJJ6oq6WB3gOuG2Xg,13686
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
- vision_agent/tools/tool_utils.py,sha256=q9cqXO2AvigUdO1krjnOy8o0goYhgS6eILl6-F5Kxyk,10211
34
- vision_agent/tools/tools.py,sha256=36f0qAhQfA5lDhYv5BKpHfHgBVEBgOD-XNVHG5K4HLY,96619
33
+ vision_agent/tools/tool_utils.py,sha256=kXB0F-HwmiChpQgKk7tMo-Acsl3UXxjaJV9mYo_q6n4,10076
34
+ vision_agent/tools/tools.py,sha256=M_kk17Yr5c6ODKet26GcxZAlGDwl0AwMMD4wCrBhR6Y,105157
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,8 +40,8 @@ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16N
40
40
  vision_agent/utils/sim.py,sha256=qr-6UWAxxGwtwIAKZjZCY_pu9VwBI_TTB8bfrGsaABg,9282
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent/utils/video_tracking.py,sha256=7ZiFBqQRTid5ytPmkrAGQUiVMr-twzib8Ha2hN3JsR0,9474
44
- vision_agent-0.2.227.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
- vision_agent-0.2.227.dist-info/METADATA,sha256=qFefkLzCo7G98LyhIPqYzPOUv5nyvOK84DJvUWmeqcc,20039
46
- vision_agent-0.2.227.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
- vision_agent-0.2.227.dist-info/RECORD,,
43
+ vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
44
+ vision_agent-0.2.229.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
45
+ vision_agent-0.2.229.dist-info/METADATA,sha256=ver5sB_NI_dkek1GxY9GsvktACS1Rl6-tgrr_B5p1Zc,20039
46
+ vision_agent-0.2.229.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
47
+ vision_agent-0.2.229.dist-info/RECORD,,