vision-agent 1.1.9.tar.gz → 1.1.10.tar.gz

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (47)
  1. {vision_agent-1.1.9 → vision_agent-1.1.10}/PKG-INFO +8 -1
  2. {vision_agent-1.1.9 → vision_agent-1.1.10}/README.md +7 -0
  3. {vision_agent-1.1.9 → vision_agent-1.1.10}/pyproject.toml +1 -1
  4. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/df.csv +49 -74
  5. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/embs.npy +0 -0
  6. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/tools.py +9 -4
  7. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/tools_doc.py +11 -5
  8. {vision_agent-1.1.9 → vision_agent-1.1.10}/.gitignore +0 -0
  9. {vision_agent-1.1.9 → vision_agent-1.1.10}/LICENSE +0 -0
  10. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/__init__.py +0 -0
  11. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/README.md +0 -0
  12. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/__init__.py +0 -0
  13. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/agent.py +0 -0
  14. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  15. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  16. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  17. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  18. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  19. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/agent/vision_agent_v2.py +0 -0
  20. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/clients/__init__.py +0 -0
  21. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/clients/http.py +0 -0
  22. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/__init__.py +0 -0
  23. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/anthropic_config.py +0 -0
  24. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/config.py +0 -0
  25. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/configs/openai_config.py +0 -0
  26. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/fonts/__init__.py +0 -0
  27. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  28. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/lmm/__init__.py +0 -0
  29. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/lmm/lmm.py +0 -0
  30. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/__init__.py +0 -0
  31. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/agent_types.py +0 -0
  32. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/lmm_types.py +0 -0
  33. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/models/tools_types.py +0 -0
  34. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/sim/__init__.py +0 -0
  35. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/sim/sim.py +0 -0
  36. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/__init__.py +0 -0
  37. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/meta_tools.py +0 -0
  38. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/planner_tools.py +0 -0
  39. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/prompts.py +0 -0
  40. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/__init__.py +0 -0
  41. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/agent.py +0 -0
  42. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/execute.py +0 -0
  44. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/image_utils.py +0 -0
  45. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/tools.py +0 -0
  46. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/video.py +0 -0
  47. {vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/video_tracking.py +0 -0
{vision_agent-1.1.9 → vision_agent-1.1.10}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agent
-Version: 1.1.9
+Version: 1.1.10
 Summary: Toolset for Vision Agent
 Project-URL: Homepage, https://landing.ai
 Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -104,6 +104,13 @@ Anthropic and Google each have their own rate limits and paid tiers. Refer to th
 
 ## Installation
 
+Install with uv:
+```bash
+uv add vision-agent
+```
+
+Install with pip:
+
 ```bash
 pip install vision-agent
 ```
{vision_agent-1.1.9 → vision_agent-1.1.10}/README.md

@@ -63,6 +63,13 @@ Anthropic and Google each have their own rate limits and paid tiers. Refer to th
 
 ## Installation
 
+Install with uv:
+```bash
+uv add vision-agent
+```
+
+Install with pip:
+
 ```bash
 pip install vision-agent
 ```
{vision_agent-1.1.9 → vision_agent-1.1.10}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vision-agent"
-version = "1.1.9"
+version = "1.1.10"
 description = "Toolset for Vision Agent"
 authors = [{ name = "Landing AI", email = "dev@landing.ai" }]
 requires-python = ">=3.9,<4.0"
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/.sim_tools/df.csv

@@ -24,8 +24,7 @@ desc,doc,name
     [
         {'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5},
-    ]
-    ",glee_object_detection
+    ]",glee_object_detection
 "'glee_sam2_instance_segmentation' is a tool that can detect multiple instances given a text prompt such as object names or referring expressions on images. It's particularly good at detecting specific objects given detailed descriptive prompts. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","glee_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
     'glee_sam2_instance_segmentation' is a tool that can detect multiple
     instances given a text prompt such as object names or referring expressions on
@@ -60,8 +59,7 @@ desc,doc,name
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
     },
-    ]
-    ",glee_sam2_instance_segmentation
+    ]",glee_sam2_instance_segmentation
 "'glee_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as object names or referring expressions. It's particularly good at detecting specific objects given detailed descriptive prompts and returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","glee_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
     'glee_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as object names or referring
@@ -103,8 +101,7 @@ desc,doc,name
         },
     ],
     ...
-    ]
-    ",glee_sam2_video_tracking
+    ]",glee_sam2_video_tracking
 "'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
     'countgd_object_detection' is a tool that can detect multiple instances of an
     object given a text prompt. It is particularly useful when trying to detect and
@@ -133,8 +130,7 @@ desc,doc,name
         {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
         {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
         {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
-    ]
-    ",countgd_object_detection
+    ]",countgd_object_detection
 "'countgd_sam2_instance_segmentation' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names, masks associated confidence scores.","countgd_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
     'countgd_sam2_instance_segmentation' is a tool that can detect multiple
     instances of an object given a text prompt. It is particularly useful when trying
@@ -170,8 +166,7 @@ desc,doc,name
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
     },
-    ]
-    ",countgd_sam2_instance_segmentation
+    ]",countgd_sam2_instance_segmentation
 "'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
     'countgd_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
@@ -213,8 +208,7 @@ desc,doc,name
         },
     ],
     ...
-    ]
-    ",countgd_sam2_video_tracking
+    ]",countgd_sam2_video_tracking
 "'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
     'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
@@ -233,8 +227,7 @@ desc,doc,name
     >>> florence2_ocr(image)
     [
         {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-    ]
-    ",florence2_ocr
+    ]",florence2_ocr
 "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
     'florence2_object_detection' is a tool that can detect multiple objects given a
     text prompt which can be object names or caption. You can optionally separate the
@@ -259,8 +252,7 @@ desc,doc,name
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-    ]
-    ",florence2_object_detection
+    ]",florence2_object_detection
 "'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray) -> List[Dict[str, Any]]:
     'florence2_sam2_instance_segmentation' is a tool that can segment multiple
     objects given a text prompt such as category names or referring expressions. The
@@ -295,8 +287,7 @@ desc,doc,name
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
     },
-    ]
-    ",florence2_sam2_instance_segmentation
+    ]",florence2_sam2_instance_segmentation
 "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
     'florence2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
@@ -337,8 +328,7 @@ desc,doc,name
         },
     ],
     ...
-    ]
-    ",florence2_sam2_video_tracking
+    ]",florence2_sam2_video_tracking
 'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
     'claude35_text_extraction' is a tool that can extract text from an image. It
     returns the extracted text as a string and can be used as an alternative to OCR if
@@ -348,12 +338,11 @@ desc,doc,name
         image (np.ndarray): The image to extract text from.
 
     Returns:
-        str: The extracted text from the image.
-    ",claude35_text_extraction
-"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
-    'document_extraction' is a tool that can extract structured information out of
-    documents with different layouts. It returns the extracted data in a structured
-    hierarchical format containing text, tables, pictures, charts, and other
+        str: The extracted text from the image.",claude35_text_extraction
+"'agentic_document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, figures, charts, and other information.","agentic_document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+    'agentic_document_extraction' is a tool that can extract structured information
+    out of documents with different layouts. It returns the extracted data in a
+    structured hierarchical format containing text, tables, figures, charts, and other
     information.
 
     Parameters:
@@ -364,21 +353,24 @@ desc,doc,name
 
     Example
     -------
-    >>> document_analysis(image)
-    {'pages':
-        [{'bbox': [0, 0, 1.0, 1.0],
-          'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
-                      'label': 'page_header',
-                      'order': 75
-                      'caption': 'Annual Report 2024',
-                      'summary': 'This annual report summarizes ...' },
-                     {'bbox': [0.2, 0.9, 0.9, 1.0],
-                      'label': 'table',
-                      'order': 1119,
-                      'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
-                      'summary': 'This table illustrates a trend of ...'},
+    >>> agentic_document_analysis(image)
+    {
+        ""markdown"": ""# Document title ## Document subtitle This is a sample document."",
+        ""chunks"": [
+            {
+                ""text"": ""# Document title"",
+                ""grounding"": [
+                    {
+                        ""box"": [0.06125, 0.019355758266818696, 0.17375, 0.03290478905359179],
+                        ""page"": 0
+                    }
         ],
-    ",document_extraction
+                ""chunk_type"": ""page_header"",
+                ""chunk_id"": ""622e0374-c50e-4960-a013-650138b42528""
+            },
+            ...
+        ]
+    }",agentic_document_extraction
 "'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
     'document_qa' is a tool that can answer any questions about arbitrary documents,
     presentations, or tables. It's very useful for document QA tasks, you can ask it a
@@ -395,8 +387,7 @@ desc,doc,name
     Example
     -------
     >>> document_qa(image, question)
-    'The answer to the question ...'
-    ",document_qa
+    'The answer to the question ...'",document_qa
 "'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
     'ocr' extracts text from an image. It returns a list of detected text, bounding
     boxes with normalized coordinates, and confidence scores. The results are sorted
@@ -414,8 +405,7 @@ desc,doc,name
     >>> ocr(image)
     [
         {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-    ]
-    ",ocr
+    ]",ocr
 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
     'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
     images including regular images or images of documents or presentations. It can be
@@ -432,8 +422,7 @@ desc,doc,name
     Example
     -------
     >>> qwen25_vl_images_vqa('Give a summary of the document', images)
-    'The document talks about the history of the United States of America and its...'
-    ",qwen25_vl_images_vqa
+    'The document talks about the history of the United States of America and its...'",qwen25_vl_images_vqa
 'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen25_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
     'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
     including regular videos or videos of documents or presentations. It returns text
@@ -449,8 +438,7 @@ desc,doc,name
     Example
     -------
     >>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
-    'Lionel Messi'
-    ",qwen25_vl_video_vqa
+    'Lionel Messi'",qwen25_vl_video_vqa
 'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen25vl', chunk_length_frames: int = 10) -> List[float]:
     'activity_recognition' is a tool that can recognize activities in a video given a
     text prompt. It can be used to identify where specific activities or actions
@@ -471,8 +459,7 @@ desc,doc,name
     Example
     -------
     >>> activity_recognition('Did a goal happened?', frames)
-    [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
-    ",activity_recognition
+    [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]",activity_recognition
 'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
     'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
     depth image from a given RGB image. The returned depth image is monochrome and
@@ -492,8 +479,7 @@ desc,doc,name
         [0, 20, 24, ..., 0, 100, 103],
         ...,
         [10, 11, 15, ..., 202, 202, 205],
-        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
-    ",depth_anything_v2
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),",depth_anything_v2
 'generate_pose_image' is a tool that generates a open pose bone/stick image from a given RGB image. The returned bone image is RGB with the pose amd keypoints colored and background as black.,"generate_pose_image(image: numpy.ndarray) -> numpy.ndarray:
     'generate_pose_image' is a tool that generates a open pose bone/stick image from
     a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
@@ -512,8 +498,7 @@ desc,doc,name
         [0, 20, 24, ..., 0, 100, 103],
         ...,
         [10, 11, 15, ..., 202, 202, 205],
-        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
-    ",generate_pose_image
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),",generate_pose_image
 'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
     'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
     It returns the predicted label and their probability scores based on image content.
@@ -528,8 +513,7 @@ desc,doc,name
     Example
     -------
     >>> vit_nsfw_classification(image)
-    {""label"": ""normal"", ""scores"": 0.68},
-    ",vit_nsfw_classification
+    {""label"": ""normal"", ""scores"": 0.68},",vit_nsfw_classification
 "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
     'flux_image_inpainting' performs image inpainting to fill the masked regions,
     given by mask, in the image, given image based on the text prompt and surrounding
@@ -599,8 +583,7 @@ desc,doc,name
     Example
     -------
     >>> siglip_classification(image, ['dog', 'cat', 'bird'])
-    {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
-    ",siglip_classification
+    {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},",siglip_classification
 "'minimum_distance' calculates the minimum distance between two detections which can include bounding boxes and or masks. This will return the closest distance between the objects, not the distance between the centers of the objects.","minimum_distance(det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]) -> float:
     'minimum_distance' calculates the minimum distance between two detections which
     can include bounding boxes and or masks. This will return the closest distance
@@ -617,8 +600,7 @@ desc,doc,name
     Example
     -------
     >>> closest_distance(det1, det2, image_size)
-    141.42
-    ",minimum_distance
+    141.42",minimum_distance
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 5) -> List[Dict[str, Union[numpy.ndarray, float]]]:
     'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
@@ -638,8 +620,7 @@ desc,doc,name
     Example
     -------
     >>> extract_frames(""path/to/video.mp4"")
-    [{""frame"": np.ndarray, ""timestamp"": 0.0}, ...]
-    ",extract_frames_and_timestamps
+    [{""frame"": np.ndarray, ""timestamp"": 0.0}, ...]",extract_frames_and_timestamps
 'save_json' is a utility function that saves data as a JSON file. It is helpful for saving data that contains NumPy arrays which are not JSON serializable.,"save_json(data: Any, file_path: str) -> None:
     'save_json' is a utility function that saves data as a JSON file. It is helpful
     for saving data that contains NumPy arrays which are not JSON serializable.
@@ -650,8 +631,7 @@ desc,doc,name
 
     Example
     -------
-    >>> save_json(data, ""path/to/file.json"")
-    ",save_json
+    >>> save_json(data, ""path/to/file.json"")",save_json
 'load_image' is a utility function that loads an image from the given file path string or an URL.,"load_image(image_path: str) -> numpy.ndarray:
     'load_image' is a utility function that loads an image from the given file path string or an URL.
 
@@ -663,8 +643,7 @@ desc,doc,name
 
     Example
     -------
-    >>> load_image(""path/to/image.jpg"")
-    ",load_image
+    >>> load_image(""path/to/image.jpg"")",load_image
 'save_image' is a utility function that saves an image to a file path.,"save_image(image: numpy.ndarray, file_path: str) -> None:
     'save_image' is a utility function that saves an image to a file path.
 
@@ -674,8 +653,7 @@ desc,doc,name
 
     Example
     -------
-    >>> save_image(image)
-    ",save_image
+    >>> save_image(image)",save_image
 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
     'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 
@@ -690,8 +668,7 @@ desc,doc,name
     Example
     -------
     >>> save_video(frames)
-    ""/tmp/tmpvideo123.mp4""
-    ",save_video
+    ""/tmp/tmpvideo123.mp4""",save_video
 'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image. It will draw a box around the detected object with the label and score.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
     'overlay_bounding_boxes' is a utility function that displays bounding boxes on
     an image. It will draw a box around the detected object with the label and score.
@@ -710,8 +687,7 @@ desc,doc,name
     -------
     >>> image_with_bboxes = overlay_bounding_boxes(
         image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-    )
-    ",overlay_bounding_boxes
+    )",overlay_bounding_boxes
 'overlay_segmentation_masks' is a utility function that displays segmentation masks. It will overlay a colored mask on the detected object with the label.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
     'overlay_segmentation_masks' is a utility function that displays segmentation
     masks. It will overlay a colored mask on the detected object with the label.
@@ -742,5 +718,4 @@ desc,doc,name
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
     }],
-    )
-    ",overlay_segmentation_masks
+    )",overlay_segmentation_masks
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/tools/tools.py

@@ -2195,9 +2195,9 @@ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
 
 
 def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
-    """'agentic_document_extraction' is a tool that can extract structured information out of
-    documents with different layouts. It returns the extracted data in a structured
-    hierarchical format containing text, tables, figures, charts, and other
+    """'agentic_document_extraction' is a tool that can extract structured information
+    out of documents with different layouts. It returns the extracted data in a
+    structured hierarchical format containing text, tables, figures, charts, and other
     information.
 
     Parameters:
@@ -2210,7 +2210,7 @@ def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
     -------
     >>> agentic_document_analysis(image)
     {
-        "markdown": "# Document title\n\n## Document subtitle\n\nThis is a sample document.",
+        "markdown": "# Document title ## Document subtitle This is a sample document.",
         "chunks": [
         {
             "text": "# Document title",
@@ -2226,6 +2226,11 @@ def agentic_document_extraction(image: np.ndarray) -> Dict[str, Any]:
         ...
         ]
     }
+
+    Notes
+    ----
+    For more document analysis features, please use the agentic-doc python package at
+    https://github.com/landing-ai/agentic-doc
     """
 
     image_file = numpy_to_bytes(image)
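
The docstring reflow and the new Notes section don't change the signature, so callers interact with the tool exactly as documented above. A minimal usage sketch, assuming the return schema shown in the docstring and that both helpers are re-exported from vision_agent.tools (the file path is illustrative):

```python
from vision_agent.tools import agentic_document_extraction, load_image

image = load_image("path/to/document.png")  # illustrative path
result = agentic_document_extraction(image)

# "markdown" is the whole-document rendering; each chunk carries its
# text, a chunk_type/chunk_id, and grounding entries with a normalized
# "box" and a "page" index, per the example in the docstring.
print(result["markdown"][:80])
for chunk in result["chunks"]:
    box = chunk["grounding"][0]["box"]
    print(chunk["chunk_type"], chunk["chunk_id"], box)
```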
{vision_agent-1.1.9 → vision_agent-1.1.10}/vision_agent/utils/tools_doc.py

@@ -7,15 +7,21 @@ import pandas as pd
 def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
     docstrings = ""
     for func in funcs:
-        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{strip_notes(func.__doc__)}\n\n"
 
     return docstrings
 
 
+def strip_notes(doc: Optional[str]) -> Optional[str]:
+    if doc is None:
+        return None
+    return doc[: doc.find("Notes\n")].strip()
+
+
 def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
     descriptions = ""
     for func in funcs:
-        description = func.__doc__
+        description = strip_notes(func.__doc__)
         if description is None:
             description = ""
 
@@ -60,13 +66,13 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
     data: Dict[str, List[str]] = {"desc": [], "doc": [], "name": []}
 
     for func in funcs:
-        desc = func.__doc__
+        desc = strip_notes(func.__doc__)
         if desc is None:
             desc = ""
         desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
         desc = " ".join(desc.split())
 
-        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{strip_notes(func.__doc__)}"
         data["desc"].append(desc)
         data["doc"].append(doc)
         data["name"].append(func.__name__)
@@ -78,7 +84,7 @@ def get_tools_info(funcs: List[Callable[..., Any]]) -> Dict[str, str]:
     data: Dict[str, str] = {}
 
     for func in funcs:
-        desc = func.__doc__
+        desc = strip_notes(func.__doc__)
         if desc is None:
             desc = ""
 
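
The new strip_notes helper exists so the Notes section just added to agentic_document_extraction never leaks into the generated tool documentation, descriptions, DataFrame, or info dict. A self-contained sketch of its behavior — the helper body is copied from the hunk above, while the sample function is hypothetical:

```python
from typing import Optional


def strip_notes(doc: Optional[str]) -> Optional[str]:
    # Copied from the 1.1.10 hunk: truncate at the first "Notes\n"
    # heading, then trim surrounding whitespace.
    if doc is None:
        return None
    return doc[: doc.find("Notes\n")].strip()


def sample_tool() -> None:
    """'sample_tool' does something useful.

    Notes
    ----
    Internal pointer that should not reach the generated tool docs.
    """


print(strip_notes(sample_tool.__doc__))
# -> 'sample_tool' does something useful.
# If a docstring has no Notes section, str.find returns -1 and the
# slice drops the final character; the trailing .strip() makes that
# harmless for docstrings that end in whitespace, as these do.
```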