vision-agent 0.2.227.tar.gz → 0.2.229.tar.gz

Files changed (47)
  1. {vision_agent-0.2.227 → vision_agent-0.2.229}/PKG-INFO +1 -1
  2. {vision_agent-0.2.227 → vision_agent-0.2.229}/pyproject.toml +1 -1
  3. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/__init__.py +3 -0
  4. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/tool_utils.py +1 -2
  5. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/tools.py +244 -0
  6. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/video_tracking.py +1 -0
  7. {vision_agent-0.2.227 → vision_agent-0.2.229}/LICENSE +0 -0
  8. {vision_agent-0.2.227 → vision_agent-0.2.229}/README.md +0 -0
  9. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/.sim_tools/df.csv +0 -0
  10. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/.sim_tools/embs.npy +0 -0
  11. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/__init__.py +0 -0
  12. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/README.md +0 -0
  13. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/agent_utils.py +0 -0
  16. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/types.py +0 -0
  17. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent.py +0 -0
  18. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_coder.py +0 -0
  19. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  20. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  21. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  22. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_planner.py +0 -0
  23. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  24. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  25. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  26. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_prompts.py +0 -0
  27. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  28. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/agent/vision_agent_v2.py +0 -0
  29. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/clients/__init__.py +0 -0
  30. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/clients/http.py +0 -0
  31. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/clients/landing_public_api.py +0 -0
  32. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/fonts/__init__.py +0 -0
  33. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  34. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/lmm/__init__.py +0 -0
  35. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/lmm/lmm.py +0 -0
  36. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/lmm/types.py +0 -0
  37. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/meta_tools.py +0 -0
  38. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/planner_tools.py +0 -0
  39. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/prompts.py +0 -0
  40. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/tools/tools_types.py +0 -0
  41. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/__init__.py +0 -0
  42. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/execute.py +0 -0
  44. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/image_utils.py +0 -0
  45. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/sim.py +0 -0
  46. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/type_defs.py +0 -0
  47. {vision_agent-0.2.227 → vision_agent-0.2.229}/vision_agent/utils/video.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.227
+Version: 0.2.229
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.227"
+version = "0.2.229"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
vision_agent/tools/__init__.py
@@ -64,6 +64,9 @@ from .tools import (
     vit_image_classification,
     vit_nsfw_classification,
     custom_object_detection,
+    agentic_object_detection,
+    agentic_sam2_instance_segmentation,
+    agentic_sam2_video_tracking,
 )
 
 __new_tools__ = [
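
The three new tools are re-exported from the package root alongside the existing ones. A minimal import check, assuming vision-agent >= 0.2.229 is installed:

# Minimal sketch: verify the new exports resolve.
from vision_agent.tools import (
    agentic_object_detection,
    agentic_sam2_instance_segmentation,
    agentic_sam2_video_tracking,
)

for tool in (
    agentic_object_detection,
    agentic_sam2_instance_segmentation,
    agentic_sam2_video_tracking,
):
    # Each tool's docstring begins with a one-line summary.
    print(tool.__name__, "-", tool.__doc__.splitlines()[0])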
vision_agent/tools/tool_utils.py
@@ -72,8 +72,7 @@ def send_inference_request(
 
     response = _call_post(url, payload, session, files, function_name, is_form)
 
-    # TODO: consider making the response schema the same between below two sources
-    return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
+    return response["data"]
 
 
 def send_task_inference_request(
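
This removes the TOOL_ENDPOINT_AUTH special case: send_inference_request now unwraps the "data" envelope unconditionally, so callers that set that environment variable previously got the raw response and now get the inner payload. A minimal sketch of the assumed envelope shape (the payload values are hypothetical):

# Hypothetical envelope: the tool endpoint is assumed to wrap its
# result under a top-level "data" key, which is now always stripped.
response = {"data": [{"label": "car", "score": 0.99}]}
data = response["data"]  # what send_inference_request now returns
assert data == [{"label": "car", "score": 0.99}]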
vision_agent/tools/tools.py
@@ -290,6 +290,14 @@ def od_sam2_video_tracking(
         )
         function_name = "florence2_object_detection"
 
+    elif od_model == ODModels.AGENTIC:
+        segment_results = agentic_object_detection(
+            prompt=prompt,
+            image=segment_frames[frame_number],
+            fine_tune_id=fine_tune_id,
+        )
+        function_name = "agentic_object_detection"
+
     elif od_model == ODModels.CUSTOM:
         segment_results = custom_object_detection(
             deployment_id=fine_tune_id,
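
With this branch in place, passing ODModels.AGENTIC routes the per-chunk detection step through agentic_object_detection. A sketch of calling the tracker directly, using the names as they appear in this file (per this diff, the function returns a dict keyed by "return_data", "display_data" and "files"):

import numpy as np

frames = [np.zeros((480, 640, 3), dtype=np.uint8)] * 30  # stand-in clip

ret = od_sam2_video_tracking(  # module-level helper in tools.py
    ODModels.AGENTIC,          # selects the new elif branch above
    prompt="person",
    frames=frames,
    chunk_length=10,           # re-detect every 10 frames
    fine_tune_id=None,
)
per_frame = ret["return_data"]  # later unwrapped by agentic_sam2_video_tracking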
vision_agent/tools/tools.py
@@ -2140,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]
     return response
 
 
+# agentic od tools
+
+
+def _agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("image", image_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "agentic",
+    }
+    metadata = {"function_name": "agentic_object_detection"}
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        # we can only execute fine-tuned models with florence2
+        payload = {
+            "prompts": payload["prompts"],
+            "jobId": fine_tune_id,
+            "model": "florence2",
+        }
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-object-detection",
+        files=files,
+        metadata=metadata,
+    )
+
+    # get the first frame
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    return {
+        "files": files,
+        "return_data": bboxes_formatted,
+        "display_data": display_data,
+    }
+
+
+def agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    fine_tune_id: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """'agentic_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in the text prompt are separated by commas. It returns a list of
+    bounding boxes with normalized coordinates, label names and associated probability
+    scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+        >>> agentic_object_detection("car", image)
+        [
+            {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+        ]
+    """
+
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    ret = _agentic_object_detection(
+        prompt, image, image_size, fine_tune_id=fine_tune_id
+    )
+
+    _display_tool_trace(
+        agentic_object_detection.__name__,
+        {"prompts": prompt},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_instance_segmentation(
+    prompt: str, image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in the text prompt are separated by commas.
+    It returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.
+
+    Parameters:
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding
+            box. The mask is a binary 2D numpy array where 1 indicates the object and
+            0 indicates the background.
+
+    Example
+    -------
+        >>> agentic_sam2_instance_segmentation("flower", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+
+    od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        agentic_sam2_instance_segmentation.__name__,
+        {
+            "prompts": prompt,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames after which agentic object
+            detection is re-run to find new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is a binary
+            2D numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with their ID to represent the total count.
+
+    Example
+    -------
+        >>> agentic_sam2_video_tracking("dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.AGENTIC,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        agentic_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def minimum_distance(
     det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
 ) -> float:
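
Taken together, this hunk adds one private helper that calls the "text-to-object-detection" task endpoint and three public tools layered on it. An illustrative end-to-end sketch based on the docstrings above (the inputs are hypothetical stand-ins; real calls hit the hosted endpoints and need a configured API key):

import numpy as np
from vision_agent.tools import (
    agentic_object_detection,
    agentic_sam2_instance_segmentation,
    agentic_sam2_video_tracking,
)

# Hypothetical inputs: any HxWx3 uint8 RGB array works as an image;
# a short list of such arrays stands in for video frames.
image = np.zeros((480, 640, 3), dtype=np.uint8)
frames = [image.copy() for _ in range(30)]

# Detection: normalized bboxes, labels, scores.
for det in agentic_object_detection("car, truck", image):
    print(det["label"], det["score"], det["bbox"])

# Instance segmentation: detection plus SAM2 masks (binary HxW arrays).
for inst in agentic_sam2_instance_segmentation("flower", image):
    print(inst["label"], inst["mask"].shape)

# Video tracking: outer list = frames, inner list = tracked objects,
# labels prefixed with an instance ID ('0: dinosaur', ...).
tracks = agentic_sam2_video_tracking("dinosaur", frames, chunk_length=10)
print(len(tracks), "frames,", len(tracks[0]), "objects in frame 0")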
vision_agent/utils/video_tracking.py
@@ -17,6 +17,7 @@ class ODModels(str, Enum):
     COUNTGD = "countgd"
     FLORENCE2 = "florence2"
     OWLV2 = "owlv2"
+    AGENTIC = "agentic"
     CUSTOM = "custom"
 
 
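
Because ODModels subclasses both str and Enum, the new member round-trips cleanly from its string value, which is what lets the model dispatch in tools.py compare against plain strings. A quick sketch:

from vision_agent.utils.video_tracking import ODModels

assert ODModels("agentic") is ODModels.AGENTIC
assert ODModels.AGENTIC.value == "agentic"
assert ODModels.AGENTIC == "agentic"  # str subclass: plain-string comparison works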