vision-agent 0.2.227__tar.gz → 0.2.228__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.227 → vision_agent-0.2.228}/PKG-INFO +1 -1
- {vision_agent-0.2.227 → vision_agent-0.2.228}/pyproject.toml +1 -1
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/__init__.py +3 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/tools.py +244 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/video_tracking.py +1 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/LICENSE +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/README.md +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/.sim_tools/df.csv +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/utils/video.py +0 -0
@@ -290,6 +290,14 @@ def od_sam2_video_tracking(
|
|
290
290
|
)
|
291
291
|
function_name = "florence2_object_detection"
|
292
292
|
|
293
|
+
elif od_model == ODModels.AGENTIC:
|
294
|
+
segment_results = agentic_object_detection(
|
295
|
+
prompt=prompt,
|
296
|
+
image=segment_frames[frame_number],
|
297
|
+
fine_tune_id=fine_tune_id,
|
298
|
+
)
|
299
|
+
function_name = "agentic_object_detection"
|
300
|
+
|
293
301
|
elif od_model == ODModels.CUSTOM:
|
294
302
|
segment_results = custom_object_detection(
|
295
303
|
deployment_id=fine_tune_id,
|
@@ -2140,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
|
|
2140
2148
|
return response
|
2141
2149
|
|
2142
2150
|
|
2151
|
+
# agentic od tools
|
2152
|
+
|
2153
|
+
|
2154
|
+
def _agentic_object_detection(
|
2155
|
+
prompt: str,
|
2156
|
+
image: np.ndarray,
|
2157
|
+
image_size: Tuple[int, ...],
|
2158
|
+
image_bytes: Optional[bytes] = None,
|
2159
|
+
fine_tune_id: Optional[str] = None,
|
2160
|
+
) -> Dict[str, Any]:
|
2161
|
+
if image_bytes is None:
|
2162
|
+
image_bytes = numpy_to_bytes(image)
|
2163
|
+
|
2164
|
+
files = [("image", image_bytes)]
|
2165
|
+
payload = {
|
2166
|
+
"prompts": [s.strip() for s in prompt.split(",")],
|
2167
|
+
"model": "agentic",
|
2168
|
+
}
|
2169
|
+
metadata = {"function_name": "agentic_object_detection"}
|
2170
|
+
|
2171
|
+
if fine_tune_id is not None:
|
2172
|
+
landing_api = LandingPublicAPI()
|
2173
|
+
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
2174
|
+
if status is not JobStatus.SUCCEEDED:
|
2175
|
+
raise FineTuneModelIsNotReady(
|
2176
|
+
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
2177
|
+
)
|
2178
|
+
|
2179
|
+
# we can only execute fine-tuned models with florence2
|
2180
|
+
payload = {
|
2181
|
+
"prompts": payload["prompts"],
|
2182
|
+
"jobId": fine_tune_id,
|
2183
|
+
"model": "florence2",
|
2184
|
+
}
|
2185
|
+
|
2186
|
+
detections = send_task_inference_request(
|
2187
|
+
payload,
|
2188
|
+
"text-to-object-detection",
|
2189
|
+
files=files,
|
2190
|
+
metadata=metadata,
|
2191
|
+
)
|
2192
|
+
|
2193
|
+
# get the first frame
|
2194
|
+
bboxes = detections[0]
|
2195
|
+
bboxes_formatted = [
|
2196
|
+
{
|
2197
|
+
"label": bbox["label"],
|
2198
|
+
"bbox": normalize_bbox(bbox["bounding_box"], image_size),
|
2199
|
+
"score": bbox["score"],
|
2200
|
+
}
|
2201
|
+
for bbox in bboxes
|
2202
|
+
]
|
2203
|
+
display_data = [
|
2204
|
+
{
|
2205
|
+
"label": bbox["label"],
|
2206
|
+
"bbox": bbox["bounding_box"],
|
2207
|
+
"score": bbox["score"],
|
2208
|
+
}
|
2209
|
+
for bbox in bboxes
|
2210
|
+
]
|
2211
|
+
return {
|
2212
|
+
"files": files,
|
2213
|
+
"return_data": bboxes_formatted,
|
2214
|
+
"display_data": display_data,
|
2215
|
+
}
|
2216
|
+
|
2217
|
+
|
2218
|
+
def agentic_object_detection(
|
2219
|
+
prompt: str,
|
2220
|
+
image: np.ndarray,
|
2221
|
+
fine_tune_id: Optional[str] = None,
|
2222
|
+
) -> List[Dict[str, Any]]:
|
2223
|
+
"""'agentic_object_detection' is a tool that can detect and count multiple objects
|
2224
|
+
given a text prompt such as category names or referring expressions on images. The
|
2225
|
+
categories in text prompt are separated by commas. It returns a list of bounding
|
2226
|
+
boxes with normalized coordinates, label names and associated probability scores.
|
2227
|
+
|
2228
|
+
Parameters:
|
2229
|
+
prompt (str): The prompt to ground to the image.
|
2230
|
+
image (np.ndarray): The image to ground the prompt to.
|
2231
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
2232
|
+
fine-tuned model ID here to use it.
|
2233
|
+
|
2234
|
+
Returns:
|
2235
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
2236
|
+
bounding box of the detected objects with normalized coordinates between 0
|
2237
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
2238
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
2239
|
+
bounding box.
|
2240
|
+
|
2241
|
+
Example
|
2242
|
+
-------
|
2243
|
+
>>> agentic_object_detection("car", image)
|
2244
|
+
[
|
2245
|
+
{'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
2246
|
+
{'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
2247
|
+
]
|
2248
|
+
"""
|
2249
|
+
|
2250
|
+
image_size = image.shape[:2]
|
2251
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
2252
|
+
return []
|
2253
|
+
|
2254
|
+
ret = _agentic_object_detection(
|
2255
|
+
prompt, image, image_size, fine_tune_id=fine_tune_id
|
2256
|
+
)
|
2257
|
+
|
2258
|
+
_display_tool_trace(
|
2259
|
+
agentic_object_detection.__name__,
|
2260
|
+
{"prompts": prompt},
|
2261
|
+
ret["display_data"],
|
2262
|
+
ret["files"],
|
2263
|
+
)
|
2264
|
+
return ret["return_data"] # type: ignore
|
2265
|
+
|
2266
|
+
|
2267
|
+
def agentic_sam2_instance_segmentation(
|
2268
|
+
prompt: str, image: np.ndarray
|
2269
|
+
) -> List[Dict[str, Any]]:
|
2270
|
+
"""'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
|
2271
|
+
instances of objects given a text prompt such as category names or referring
|
2272
|
+
expressions on images. The categories in text prompt are separated by commas. It
|
2273
|
+
returns a list of bounding boxes with normalized coordinates, label names, masks
|
2274
|
+
and associated probability scores.
|
2275
|
+
|
2276
|
+
Parameters:
|
2277
|
+
prompt (str): The object that needs to be counted.
|
2278
|
+
image (np.ndarray): The image that contains multiple instances of the object.
|
2279
|
+
|
2280
|
+
Returns:
|
2281
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
2282
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
2283
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
2284
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
2285
|
+
The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
|
2286
|
+
the background.
|
2287
|
+
|
2288
|
+
Example
|
2289
|
+
-------
|
2290
|
+
>>> agentic_sam2_instance_segmentation("flower", image)
|
2291
|
+
[
|
2292
|
+
{
|
2293
|
+
'score': 0.49,
|
2294
|
+
'label': 'flower',
|
2295
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
2296
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
2297
|
+
[0, 0, 0, ..., 0, 0, 0],
|
2298
|
+
...,
|
2299
|
+
[0, 0, 0, ..., 0, 0, 0],
|
2300
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
2301
|
+
},
|
2302
|
+
]
|
2303
|
+
"""
|
2304
|
+
|
2305
|
+
od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
|
2306
|
+
seg_ret = _sam2(
|
2307
|
+
image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
|
2308
|
+
)
|
2309
|
+
|
2310
|
+
_display_tool_trace(
|
2311
|
+
agentic_sam2_instance_segmentation.__name__,
|
2312
|
+
{
|
2313
|
+
"prompts": prompt,
|
2314
|
+
},
|
2315
|
+
seg_ret["display_data"],
|
2316
|
+
seg_ret["files"],
|
2317
|
+
)
|
2318
|
+
|
2319
|
+
return seg_ret["return_data"] # type: ignore
|
2320
|
+
|
2321
|
+
|
2322
|
+
def agentic_sam2_video_tracking(
|
2323
|
+
prompt: str,
|
2324
|
+
frames: List[np.ndarray],
|
2325
|
+
chunk_length: Optional[int] = 10,
|
2326
|
+
fine_tune_id: Optional[str] = None,
|
2327
|
+
) -> List[List[Dict[str, Any]]]:
|
2328
|
+
"""'agentic_sam2_video_tracking' is a tool that can track and segment multiple
|
2329
|
+
objects in a video given a text prompt such as category names or referring
|
2330
|
+
expressions. The categories in the text prompt are separated by commas. It returns
|
2331
|
+
a list of bounding boxes, label names, masks and associated probability scores and
|
2332
|
+
is useful for tracking and counting without duplicating counts.
|
2333
|
+
|
2334
|
+
Parameters:
|
2335
|
+
prompt (str): The prompt to ground to the image.
|
2336
|
+
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
2337
|
+
chunk_length (Optional[int]): The number of frames after which to re-run agentic object detection
|
2338
|
+
to find new objects.
|
2339
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
2340
|
+
fine-tuned model ID here to use it.
|
2341
|
+
|
2342
|
+
Returns:
|
2343
|
+
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
2344
|
+
label, segmentation mask and bounding boxes. The outer list represents each
|
2345
|
+
frame and the inner list is the entities per frame. The detected objects
|
2346
|
+
have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
|
2347
|
+
and ymin are the coordinates of the top-left and xmax and ymax are the
|
2348
|
+
coordinates of the bottom-right of the bounding box. The mask is a binary 2D
|
2349
|
+
numpy array where 1 indicates the object and 0 indicates the background.
|
2350
|
+
The label names are prefixed with their ID, which represents the total count.
|
2351
|
+
|
2352
|
+
Example
|
2353
|
+
-------
|
2354
|
+
>>> agentic_sam2_video_tracking("dinosaur", frames)
|
2355
|
+
[
|
2356
|
+
[
|
2357
|
+
{
|
2358
|
+
'label': '0: dinosaur',
|
2359
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
2360
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
2361
|
+
[0, 0, 0, ..., 0, 0, 0],
|
2362
|
+
...,
|
2363
|
+
[0, 0, 0, ..., 0, 0, 0],
|
2364
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
2365
|
+
},
|
2366
|
+
],
|
2367
|
+
...
|
2368
|
+
]
|
2369
|
+
"""
|
2370
|
+
|
2371
|
+
ret = od_sam2_video_tracking(
|
2372
|
+
ODModels.AGENTIC,
|
2373
|
+
prompt=prompt,
|
2374
|
+
frames=frames,
|
2375
|
+
chunk_length=chunk_length,
|
2376
|
+
fine_tune_id=fine_tune_id,
|
2377
|
+
)
|
2378
|
+
_display_tool_trace(
|
2379
|
+
agentic_sam2_video_tracking.__name__,
|
2380
|
+
{},
|
2381
|
+
ret["display_data"],
|
2382
|
+
ret["files"],
|
2383
|
+
)
|
2384
|
+
return ret["return_data"] # type: ignore
|
2385
|
+
|
2386
|
+
|
2143
2387
|
def minimum_distance(
|
2144
2388
|
det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
|
2145
2389
|
) -> float:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
{vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_prompts_v2.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_prompts.py
RENAMED
File without changes
|
{vision_agent-0.2.227 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_prompts_v2.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|