vision-agent 1.1.9__tar.gz → 1.1.10__tar.gz
This diff compares the contents of two package versions as published to a public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
- {vision_agent-1.1.9 → vision_agent-1.1.10}/PKG-INFO +8 -1
- {vision_agent-1.1.9 → vision_agent-1.1.10}/README.md +7 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/pyproject.toml +1 -1
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/df.csv +49 -74
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/tools.py +9 -4
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/tools_doc.py +11 -5
- {vision_agent-1.1.9 → vision_agent-1.1.10}/.gitignore +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/LICENSE +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/README.md +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/agent.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/clients/http.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/anthropic_config.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/config.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/openai_config.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/agent_types.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/lmm_types.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/tools_types.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/sim/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/sim/sim.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/agent.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/execute.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/tools.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/video.py +0 -0
- {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/video_tracking.py +0 -0
{vision_agent-1.1.9 → vision_agent-1.1.10}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agent
-Version: 1.1.9
+Version: 1.1.10
 Summary: Toolset for Vision Agent
 Project-URL: Homepage, https://landing.ai
 Project-URL: repository, https://github.com/landing-ai/vision-agent

@@ -104,6 +104,13 @@ Anthropic and Google each have their own rate limits and paid tiers. Refer to th
 
 ## Installation
 
+Install with uv:
+```bash
+uv add vision-agent
+```
+
+Install with pip:
+
 ```bash
 pip install vision-agent
 ```
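A quick way to confirm which version is active after either command (a minimal sketch; assumes Python 3.8+ where `importlib.metadata` is in the standard library):

```python
from importlib.metadata import version

# Should print 1.1.10 after upgrading.
print(version("vision-agent"))
```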
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/df.csv

@@ -24,8 +24,7 @@ desc,doc,name
 [
 {'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
 {'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5},
-]
-",glee_object_detection
+]",glee_object_detection
 "'glee_sam2_instance_segmentation' is a tool that can detect multiple instances given a text prompt such as object names or referring expressions on images. It's particularly good at detecting specific objects given detailed descriptive prompts. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","glee_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
 'glee_sam2_instance_segmentation' is a tool that can detect multiple
 instances given a text prompt such as object names or referring expressions on

@@ -60,8 +59,7 @@ desc,doc,name
 [0, 0, 0, ..., 0, 0, 0],
 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 },
-]
-",glee_sam2_instance_segmentation
+]",glee_sam2_instance_segmentation
 "'glee_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as object names or referring expressions. It's particularly good at detecting specific objects given detailed descriptive prompts and returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","glee_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
 'glee_sam2_video_tracking' is a tool that can track and segment multiple
 objects in a video given a text prompt such as object names or referring

@@ -103,8 +101,7 @@ desc,doc,name
 },
 ],
 ...
-]
-",glee_sam2_video_tracking
+]",glee_sam2_video_tracking
 "'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
 'countgd_object_detection' is a tool that can detect multiple instances of an
 object given a text prompt. It is particularly useful when trying to detect and

@@ -133,8 +130,7 @@ desc,doc,name
 {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
 {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
 {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
-]
-",countgd_object_detection
+]",countgd_object_detection
 "'countgd_sam2_instance_segmentation' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names, masks associated confidence scores.","countgd_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
 'countgd_sam2_instance_segmentation' is a tool that can detect multiple
 instances of an object given a text prompt. It is particularly useful when trying

@@ -170,8 +166,7 @@ desc,doc,name
 [0, 0, 0, ..., 0, 0, 0],
 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 },
-]
-",countgd_sam2_instance_segmentation
+]",countgd_sam2_instance_segmentation
 "'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
 'countgd_sam2_video_tracking' is a tool that can track and segment multiple
 objects in a video given a text prompt such as category names or referring

@@ -213,8 +208,7 @@ desc,doc,name
 },
 ],
 ...
-]
-",countgd_sam2_video_tracking
+]",countgd_sam2_video_tracking
 "'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_ocr' is a tool that can detect text and text regions in an image.
 Each text region contains one line of text. It returns a list of detected text,

@@ -233,8 +227,7 @@ desc,doc,name
 >>> florence2_ocr(image)
 [
 {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-]
-",florence2_ocr
+]",florence2_ocr
 "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_object_detection' is a tool that can detect multiple objects given a
 text prompt which can be object names or caption. You can optionally separate the

@@ -259,8 +252,7 @@ desc,doc,name
 [
 {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
 {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-]
-",florence2_object_detection
+]",florence2_object_detection
 "'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
 'florence2_sam2_instance_segmentation' is a tool that can segment multiple
 objects given a text prompt such as category names or referring expressions. The

@@ -295,8 +287,7 @@ desc,doc,name
 [0, 0, 0, ..., 0, 0, 0],
 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 },
-]
-",florence2_sam2_instance_segmentation
+]",florence2_sam2_instance_segmentation
 "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
 'florence2_sam2_video_tracking' is a tool that can track and segment multiple
 objects in a video given a text prompt such as category names or referring

@@ -337,8 +328,7 @@ desc,doc,name
 },
 ],
 ...
-]
-",florence2_sam2_video_tracking
+]",florence2_sam2_video_tracking
 'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
 'claude35_text_extraction' is a tool that can extract text from an image. It
 returns the extracted text as a string and can be used as an alternative to OCR if

@@ -348,12 +338,11 @@ desc,doc,name
 image (np.ndarray): The image to extract text from.
 
 Returns:
-str: The extracted text from the image.
-
-
-
-
-hierarchical format containing text, tables, pictures, charts, and other
+str: The extracted text from the image.",claude35_text_extraction
+"'agentic_document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, figures, charts, and other information.","agentic_document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+'agentic_document_extraction' is a tool that can extract structured information
+out of documents with different layouts. It returns the extracted data in a
+structured hierarchical format containing text, tables, figures, charts, and other
 information.
 
 Parameters:

@@ -364,21 +353,24 @@ desc,doc,name
 
 Example
 -------
->>>
-{
-
-
-
-
-
-
-
-
-'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
-'summary': 'This table illustrates a trend of ...'},
+>>> agentic_document_analysis(image)
+{
+""markdown"": ""# Document title ## Document subtitle This is a sample document."",
+""chunks"": [
+{
+""text"": ""# Document title"",
+""grounding"": [
+{
+""box"": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
+""page"": 0
+}
 ],
-
+""chunk_type"": ""page_header"",
+""chunk_id"": ""622e0374-c50e-4960-a013-650138b42528""
+},
+...
+]
+}",agentic_document_extraction
 "'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
 'document_qa' is a tool that can answer any questions about arbitrary documents,
 presentations, or tables. It's very useful for document QA tasks, you can ask it a

@@ -395,8 +387,7 @@ desc,doc,name
 Example
 -------
 >>> document_qa(image, question)
-'The answer to the question ...'
-",document_qa
+'The answer to the question ...'",document_qa
 "'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
 'ocr' extracts text from an image. It returns a list of detected text, bounding
 boxes with normalized coordinates, and confidence scores. The results are sorted

@@ -414,8 +405,7 @@ desc,doc,name
 >>> ocr(image)
 [
 {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-]
-",ocr
+]",ocr
 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
 images including regular images or images of documents or presentations. It can be

@@ -432,8 +422,7 @@ desc,doc,name
 Example
 -------
 >>> qwen25_vl_images_vqa('Give a summary of the document', images)
-'The document talks about the history of the United States of America and its...'
-",qwen25_vl_images_vqa
+'The document talks about the history of the United States of America and its...'",qwen25_vl_images_vqa
 'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen25_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
 'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
 including regular videos or videos of documents or presentations. It returns text

@@ -449,8 +438,7 @@ desc,doc,name
 Example
 -------
 >>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
-'Lionel Messi'
-",qwen25_vl_video_vqa
+'Lionel Messi'",qwen25_vl_video_vqa
 'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen25vl', chunk_length_frames: int = 10) -> List[float]:
 'activity_recognition' is a tool that can recognize activities in a video given a
 text prompt. It can be used to identify where specific activities or actions

@@ -471,8 +459,7 @@ desc,doc,name
 Example
 -------
 >>> activity_recognition('Did a goal happened?', frames)
-[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
-",activity_recognition
+[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",activity_recognition
 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
 depth image from a given RGB image. The returned depth image is monochrome and

@@ -492,8 +479,7 @@ desc,doc,name
 [0, 20, 24, ..., 0, 100, 103],
 ...,
 [10, 11, 15, ..., 202, 202, 205],
-[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
-",depth_anything_v2
+[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),",depth_anything_v2
 'generate_pose_image' is a tool that generates a open pose bone/stick image from a given RGB image. The returned bone image is RGB with the pose amd keypoints colored and background as black.,"generate_pose_image(image: numpy.ndarray) -> numpy.ndarray:
 'generate_pose_image' is a tool that generates a open pose bone/stick image from
 a given RGB image. The returned bone image is RGB with the pose amd keypoints colored

@@ -512,8 +498,7 @@ desc,doc,name
 [0, 20, 24, ..., 0, 100, 103],
 ...,
 [10, 11, 15, ..., 202, 202, 205],
-[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
-",generate_pose_image
+[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),",generate_pose_image
 'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
 'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
 It returns the predicted label and their probability scores based on image content.

@@ -528,8 +513,7 @@ desc,doc,name
 Example
 -------
 >>> vit_nsfw_classification(image)
-{""label"": ""normal"", ""scores"": 0.68},
-",vit_nsfw_classification
+{""label"": ""normal"", ""scores"": 0.68},",vit_nsfw_classification
 "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
 'flux_image_inpainting' performs image inpainting to fill the masked regions,
 given by mask, in the image, given image based on the text prompt and surrounding

@@ -599,8 +583,7 @@ desc,doc,name
 Example
 -------
 >>> siglip_classification(image, ['dog', 'cat', 'bird'])
-{""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-",siglip_classification
+{""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},",siglip_classification
 "'minimum_distance' calculates the minimum distance between two detections which can include bounding boxes and or masks. This will return the closest distance between the objects, not the distance between the centers of the objects.","minimum_distance(det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]) -> float:
 'minimum_distance' calculates the minimum distance between two detections which
 can include bounding boxes and or masks. This will return the closest distance

@@ -617,8 +600,7 @@ desc,doc,name
 Example
 -------
 >>> closest_distance(det1, det2, image_size)
-141.42
-",minimum_distance
+141.42",minimum_distance
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 5) -> List[Dict[str, Union[numpy.ndarray, float]]]:
 'extract_frames_and_timestamps' extracts frames and timestamps from a video
 which can be a file path, url or youtube link, returns a list of dictionaries

@@ -638,8 +620,7 @@ desc,doc,name
 Example
 -------
 >>> extract_frames(""path/to/video.mp4"")
-[{""frame"": np.ndarray, ""timestamp"": 0.0}, ...]
-",extract_frames_and_timestamps
+[{""frame"": np.ndarray, ""timestamp"": 0.0}, ...]",extract_frames_and_timestamps
 'save_json' is a utility function that saves data as a JSON file. It is helpful for saving data that contains NumPy arrays which are not JSON serializable.,"save_json(data: Any, file_path: str) -> None:
 'save_json' is a utility function that saves data as a JSON file. It is helpful
 for saving data that contains NumPy arrays which are not JSON serializable.

@@ -650,8 +631,7 @@ desc,doc,name
 
 Example
 -------
->>> save_json(data, ""path/to/file.json"")
-",save_json
+>>> save_json(data, ""path/to/file.json"")",save_json
 'load_image' is a utility function that loads an image from the given file path string or an URL.,"load_image(image_path: str) -> numpy.ndarray:
 'load_image' is a utility function that loads an image from the given file path string or an URL.
 

@@ -663,8 +643,7 @@ desc,doc,name
 
 Example
 -------
->>> load_image(""path/to/image.jpg"")
-",load_image
+>>> load_image(""path/to/image.jpg"")",load_image
 'save_image' is a utility function that saves an image to a file path.,"save_image(image: numpy.ndarray, file_path: str) -> None:
 'save_image' is a utility function that saves an image to a file path.
 

@@ -674,8 +653,7 @@ desc,doc,name
 
 Example
 -------
->>> save_image(image)
-",save_image
+>>> save_image(image)",save_image
 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 

@@ -690,8 +668,7 @@ desc,doc,name
 Example
 -------
 >>> save_video(frames)
-""/tmp/tmpvideo123.mp4""
-",save_video
+""/tmp/tmpvideo123.mp4""",save_video
 'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image. It will draw a box around the detected object with the label and score.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
 'overlay_bounding_boxes' is a utility function that displays bounding boxes on
 an image. It will draw a box around the detected object with the label and score.

@@ -710,8 +687,7 @@ desc,doc,name
 -------
 >>> image_with_bboxes = overlay_bounding_boxes(
 image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-)
-",overlay_bounding_boxes
+)",overlay_bounding_boxes
 'overlay_segmentation_masks' is a utility function that displays segmentation masks. It will overlay a colored mask on the detected object with the label.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
 'overlay_segmentation_masks' is a utility function that displays segmentation
 masks. It will overlay a colored mask on the detected object with the label.

@@ -742,5 +718,4 @@ desc,doc,name
 [0, 0, 0, ..., 0, 0, 0],
 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 }],
-)
-",overlay_segmentation_masks
+)",overlay_segmentation_masks
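The rows above are the per-tool docstrings that ship in df.csv for similarity search over the toolset. As a rough illustration of how the documented tools compose, here is a minimal sketch with signatures taken from the docstrings above; it assumes an installed and configured vision-agent (the detection tools call hosted models, so an API key and network access are required), and the image path is a placeholder:

```python
import numpy as np

from vision_agent.tools import (
    countgd_object_detection,
    load_image,
    overlay_bounding_boxes,
    save_image,
)

# Load an image from a file path or URL (placeholder path).
image: np.ndarray = load_image("path/to/image.jpg")

# Detect every instance matching the prompt; each result dict carries a
# normalized bbox, a label, and a confidence score, per the docs above.
detections = countgd_object_detection("flower", image)
print(f"detected {len(detections)} flowers")

# Draw the labeled boxes onto the image and save the annotated copy.
annotated = overlay_bounding_boxes(image, detections)
save_image(annotated, "flowers_annotated.jpg")
```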
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/embs.npy

Binary file
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/tools.py

@@ -2195,9 +2195,9 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
 
 
 def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
-    """'agentic_document_extraction' is a tool that can extract structured information
-    documents with different layouts. It returns the extracted data in a
-    hierarchical format containing text, tables, figures, charts, and other
+    """'agentic_document_extraction' is a tool that can extract structured information
+    out of documents with different layouts. It returns the extracted data in a
+    structured hierarchical format containing text, tables, figures, charts, and other
     information.
 
     Parameters:

@@ -2210,7 +2210,7 @@ def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
     -------
     >>> agentic_document_analysis(image)
     {
-        "markdown": "# Document title
+        "markdown": "# Document title ## Document subtitle This is a sample document.",
         "chunks": [
             {
                 "text": "# Document title",

@@ -2226,6 +2226,11 @@ def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
             ...
         ]
     }
+
+    Notes
+    ----
+    For more document analysis features, please use the agentic-doc python package at
+    https://github.com/landing-ai/agentic-doc
     """
 
     image_file = numpy_to_bytes(image)
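To make the updated docstring concrete, here is a small sketch of how the hierarchical result might be consumed; the `markdown`, `chunks`, `grounding`, `chunk_type`, and `box` keys come from the example output in the docstring above, and the image path is a placeholder:

```python
import numpy as np

from vision_agent.tools import agentic_document_extraction, load_image

image: np.ndarray = load_image("path/to/document.png")
result = agentic_document_extraction(image)

# The whole document rendered as one markdown string.
print(result["markdown"])

# Each chunk carries its text, a chunk_type (e.g. "page_header"), an id,
# and grounding entries with a normalized box plus a 0-based page index.
for chunk in result["chunks"]:
    for grounding in chunk["grounding"]:
        print(chunk["chunk_type"], grounding["page"], grounding["box"])
```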
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/tools_doc.py

@@ -7,15 +7,21 @@ import pandas as pd
 def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
     docstrings = ""
     for func in funcs:
-        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{strip_notes(func.__doc__)}\n\n"
 
     return docstrings
 
 
+def strip_notes(doc: Optional[str]) -> Optional[str]:
+    if doc is None:
+        return None
+    return doc[: doc.find("Notes\n")].strip()
+
+
 def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
     descriptions = ""
     for func in funcs:
-        description = func.__doc__
+        description = strip_notes(func.__doc__)
         if description is None:
             description = ""
 

@@ -60,13 +66,13 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
     data: Dict[str, List[str]] = {"desc": [], "doc": [], "name": []}
 
     for func in funcs:
-        desc = func.__doc__
+        desc = strip_notes(func.__doc__)
         if desc is None:
             desc = ""
         desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
         desc = " ".join(desc.split())
 
-        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{strip_notes(func.__doc__)}"
         data["desc"].append(desc)
         data["doc"].append(doc)
         data["name"].append(func.__name__)

@@ -78,7 +84,7 @@ def get_tools_info(funcs: List[Callable[..., Any]]) -> Dict[str, str]:
     data: Dict[str, str] = {}
 
     for func in funcs:
-        desc = func.__doc__
+        desc = strip_notes(func.__doc__)
         if desc is None:
             desc = ""
 
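The effect of the new `strip_notes` helper, sketched below: everything from a docstring's `Notes` heading onward is trimmed before the docstring is embedded into the tool documentation, descriptions, and df.csv, so notes like the agentic-doc pointer added above never reach the similarity index. (When a docstring has no `Notes` heading, `str.find` returns -1 and the slice drops only the final character before `strip()`, which is harmless for docstrings ending in whitespace.) The sample docstring is hypothetical:

```python
from typing import Optional


def strip_notes(doc: Optional[str]) -> Optional[str]:
    # Same logic as the helper added in tools_doc.py above.
    if doc is None:
        return None
    return doc[: doc.find("Notes\n")].strip()


doc = """'agentic_document_extraction' extracts structured information.

    Notes
    ----
    For more document analysis features, please use the agentic-doc python package.
"""

# The Notes section is cut, so only the description survives.
print(strip_notes(doc))
# -> 'agentic_document_extraction' extracts structured information.
```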
All remaining files are unchanged between the two versions (renamed only by the {vision_agent-1.1.9 → vision_agent-1.1.10} path prefix), as listed above.