vision-agent 0.2.239.tar.gz → 0.2.241.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.239 → vision_agent-0.2.241}/PKG-INFO +1 -1
- {vision_agent-0.2.239 → vision_agent-0.2.241}/pyproject.toml +1 -1
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/__init__.py +0 -2
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/meta_tools.py +1 -124
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/tools.py +23 -111
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/exceptions.py +0 -7
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/video.py +25 -34
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/video_tracking.py +8 -3
- vision_agent-0.2.239/vision_agent/clients/landing_public_api.py +0 -38
- {vision_agent-0.2.239 → vision_agent-0.2.241}/LICENSE +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/README.md +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/.sim_tools/df.csv +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/configs/anthropic_config.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/configs/anthropic_openai_config.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/configs/config.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/configs/openai_config.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/models/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/models/agent_types.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/models/lmm_types.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/models/tools_types.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/sim/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/sim/sim.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/agent.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/tools.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/tools_doc.py +0 -0
- {vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/type_defs.py +0 -0
{vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/meta_tools.py

@@ -11,11 +11,9 @@ import libcst as cst
 from IPython.display import display
 
 import vision_agent as va
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.models import BboxInput, BboxInputBase64, Message, PromptTask
+from vision_agent.models import Message
 from vision_agent.tools.tools import get_tools_descriptions as _get_tool_descriptions
 from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64
 from vision_agent.utils.tools_doc import get_tool_documentation
 
 CURRENT_FILE = None

@@ -573,48 +571,6 @@ def get_tool_descriptions() -> str:
     return _get_tool_descriptions()
 
 
-def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
-    """DO NOT use this function unless the user has supplied you with bboxes.
-    'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
-    be able to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
-            and bounding boxes. The coordinates are unnormalized.
-
-    Returns:
-        str: The fine tuning job id, this id will used to retrieve the fine tuned
-            model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = object_detection_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "phrase_grounding"
-        )
-    """
-    task = "phrase_grounding"
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_type = PromptTask[task.upper()]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=Path(bbox_input.image_path).name,
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    fine_tune_id = str(
-        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
-    )
-    print(f"[Fine tuning id: {fine_tune_id}]")
-    return fine_tune_id
-
-
 def get_diff(before: str, after: str) -> str:
     return "".join(
         difflib.unified_diff(

@@ -721,83 +677,6 @@ def use_extra_vision_agent_args(
     return modified_tree.code
 
 
-def use_object_detection_fine_tuning(
-    artifacts: Artifacts, name: str, fine_tune_id: str
-) -> str:
-    """Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and
-    'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
-    the fined tuned florence2 model. Returns the diff between the original code and the
-    new code.
-
-    Parameters:
-        artifacts (Artifacts): The artifacts object to edit the code from.
-        name (str): The name of the artifact to edit.
-        fine_tune_id (str): The fine tuning job id.
-
-    Examples
-    --------
-        >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
-    """
-
-    if name not in artifacts:
-        output_str = f"[Artifact {name} does not exist]"
-        print(output_str)
-        return output_str
-
-    code = artifacts[name]
-
-    patterns_with_fine_tune_id = [
-        (
-            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-        (
-            r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-        (
-            r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'owl_v2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-        (
-            r'florence2_sam2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_sam2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-    ]
-
-    new_code = code
-    for (
-        pattern_with_fine_tune_id,
-        replacer_with_fine_tune_id,
-    ) in patterns_with_fine_tune_id:
-        if re.search(pattern_with_fine_tune_id, new_code):
-            new_code = re.sub(
-                pattern_with_fine_tune_id, replacer_with_fine_tune_id, new_code
-            )
-
-    if new_code == code:
-        output_str = (
-            f"[No function calls to replace with fine tuning id in artifact {name}]"
-        )
-        print(output_str)
-        return output_str
-
-    artifacts[name] = new_code
-
-    diff = get_diff_with_prompts(name, code, new_code)
-    print(diff)
-
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {"name": name, "content": new_code, "action": "edit"}
-            )
-        },
-        raw=True,
-    )
-    return diff
-
-
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,

@@ -807,8 +686,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         view_media_artifact,
-        object_detection_fine_tuning,
-        use_object_detection_fine_tuning,
         list_artifacts,
     ]
 )
{vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/tools/tools.py

@@ -8,8 +8,7 @@ from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import resources
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
-from uuid import UUID
+from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 import cv2
 import numpy as np
@@ -20,10 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 
-from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
-from vision_agent.models import JobStatus
-from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
@@ -239,7 +235,7 @@ def od_sam2_video_tracking(
     frames: List[np.ndarray],
     box_threshold: float = 0.30,
     chunk_length: Optional[int] = 50,
-    fine_tune_id: Optional[str] = None,
+    deployment_id: Optional[str] = None,
 ) -> Dict[str, Any]:
     chunk_length = 50 if chunk_length is None else chunk_length
     segment_size = chunk_length

@@ -262,7 +258,7 @@ def od_sam2_video_tracking(
         prompt: str,
         segment_index: int,
         frame_number: int,
-        fine_tune_id: str,
+        deployment_id: str,
         segment_frames: list,
     ) -> tuple:
         """

@@ -273,7 +269,7 @@ def od_sam2_video_tracking(
             prompt: The prompt for the object detection model.
             segment_index: The index of the current segment.
            frame_number: The number of the current frame.
-            fine_tune_id: Optional fine-tune model ID.
+            deployment_id: Optional The Model deployment ID.
             segment_frames: List of frames for the current segment.
 
         Returns:

@@ -293,7 +289,6 @@ def od_sam2_video_tracking(
                 prompt=prompt,
                 image=segment_frames[frame_number],
                 box_threshold=box_threshold,
-                fine_tune_id=fine_tune_id,
             )
             function_name = "owlv2_object_detection"
 
@@ -301,7 +296,6 @@ def od_sam2_video_tracking(
             segment_results = florence2_object_detection(
                 prompt=prompt,
                 image=segment_frames[frame_number],
-                fine_tune_id=fine_tune_id,
             )
             function_name = "florence2_object_detection"
 
@@ -309,13 +303,12 @@ def od_sam2_video_tracking(
             segment_results = agentic_object_detection(
                 prompt=prompt,
                 image=segment_frames[frame_number],
-                fine_tune_id=fine_tune_id,
             )
             function_name = "agentic_object_detection"
 
         elif od_model == ODModels.CUSTOM:
             segment_results = custom_object_detection(
-                deployment_id=fine_tune_id,
+                deployment_id=deployment_id,
                 image=segment_frames[frame_number],
                 box_threshold=box_threshold,
             )

@@ -337,7 +330,7 @@ def od_sam2_video_tracking(
             segment_frames=segment,
             od_model=od_model,
             prompt=prompt,
-            fine_tune_id=fine_tune_id,
+            deployment_id=deployment_id,
             chunk_length=chunk_length,
             image_size=image_size,
             segment_index=segment_index,
@@ -376,7 +369,6 @@ def _owlv2_object_detection(
     box_threshold: float,
     image_size: Tuple[int, ...],
     image_bytes: Optional[bytes] = None,
-    fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
     if image_bytes is None:
         image_bytes = numpy_to_bytes(image)

@@ -389,21 +381,6 @@ def _owlv2_object_detection(
     }
     metadata = {"function_name": "owlv2_object_detection"}
 
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        # we can only execute fine-tuned models with florence2
-        payload = {
-            "prompts": payload["prompts"],
-            "jobId": fine_tune_id,
-            "model": "florence2",
-        }
-
     detections = send_task_inference_request(
         payload,
         "text-to-object-detection",
@@ -440,7 +417,6 @@ def owlv2_object_detection(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
-    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'owlv2_object_detection' is a tool that can detect and count multiple objects
     given a text prompt such as category names or referring expressions on images. The

@@ -452,8 +428,6 @@ def owlv2_object_detection(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and

@@ -475,9 +449,7 @@ def owlv2_object_detection(
     if image_size[0] < 1 or image_size[1] < 1:
         return []
 
-    ret = _owlv2_object_detection(
-        prompt, image, box_threshold, image_size, fine_tune_id=fine_tune_id
-    )
+    ret = _owlv2_object_detection(prompt, image, box_threshold, image_size)
 
     _display_tool_trace(
         owlv2_object_detection.__name__,
@@ -556,7 +528,6 @@ def owlv2_sam2_video_tracking(
     frames: List[np.ndarray],
     box_threshold: float = 0.10,
     chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring

@@ -571,8 +542,6 @@ def owlv2_sam2_video_tracking(
             to 0.10.
         chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
             new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the

@@ -609,7 +578,6 @@ def owlv2_sam2_video_tracking(
         frames=frames,
         box_threshold=box_threshold,
         chunk_length=chunk_length,
-        fine_tune_id=fine_tune_id,
     )
     _display_tool_trace(
         owlv2_sam2_video_tracking.__name__,
@@ -624,7 +592,8 @@ def owlv2_sam2_video_tracking(
 
 
 def florence2_object_detection(
-    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+    prompt: str,
+    image: np.ndarray,
 ) -> List[Dict[str, Any]]:
     """'florence2_object_detection' is a tool that can detect multiple objects given a
     text prompt which can be object names or caption. You can optionally separate the

@@ -635,8 +604,6 @@ def florence2_object_detection(
         prompt (str): The prompt to ground to the image. Use exclusive categories that
             do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to used to detect objects
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and

@@ -653,6 +620,7 @@ def florence2_object_detection(
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
         ]
     """
+
     image_size = image.shape[:2]
     if image_size[0] < 1 or image_size[1] < 1:
         return []

@@ -665,16 +633,6 @@ def florence2_object_detection(
     }
     metadata = {"function_name": "florence2_object_detection"}
 
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        payload["jobId"] = fine_tune_id
-
     detections = send_task_inference_request(
         payload,
         "text-to-object-detection",
@@ -703,7 +661,8 @@ def florence2_object_detection(
 
 
 def florence2_sam2_instance_segmentation(
-    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+    prompt: str,
+    image: np.ndarray,
 ) -> List[Dict[str, Any]]:
     """'florence2_sam2_instance_segmentation' is a tool that can segment multiple
     objects given a text prompt such as category names or referring expressions. The

@@ -715,8 +674,6 @@ def florence2_sam2_instance_segmentation(
         prompt (str): The prompt to ground to the image. Use exclusive categories that
             do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to ground the prompt to.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,

@@ -742,6 +699,7 @@ def florence2_sam2_instance_segmentation(
         },
     ]
     """
+
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []
 
@@ -753,16 +711,6 @@ def florence2_sam2_instance_segmentation(
     }
     metadata = {"function_name": "florence2_sam2_instance_segmentation"}
 
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        payload["jobId"] = fine_tune_id
-
     detections = send_task_inference_request(
         payload,
         "text-to-instance-segmentation",
@@ -792,7 +740,6 @@ def florence2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
     chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring

@@ -806,8 +753,6 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the

@@ -837,6 +782,7 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+
     if len(frames) == 0 or not isinstance(frames, List):
         raise ValueError("Must provide a list of numpy arrays for frames")
 
@@ -851,16 +797,6 @@ def florence2_sam2_video_tracking(
     if chunk_length is not None:
         payload["chunk_length_frames"] = chunk_length  # type: ignore
 
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        payload["jobId"] = fine_tune_id
-
     detections = send_task_inference_request(
         payload,
         "text-to-instance-segmentation",
@@ -1397,7 +1333,7 @@ def custom_od_sam2_video_tracking(
         prompt="",
         frames=frames,
         chunk_length=chunk_length,
-        fine_tune_id=deployment_id,
+        deployment_id=deployment_id,
     )
     _display_tool_trace(
         custom_od_sam2_video_tracking.__name__,
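The hunk above finishes the rename inside the video-tracking path: custom_od_sam2_video_tracking now forwards deployment_id under its own name instead of smuggling it through the old fine_tune_id parameter. A minimal usage sketch, assuming the keyword signature implied by the call above (the frames and the deployment UUID are illustrative, not taken from this diff):

    import numpy as np
    from vision_agent.tools.tools import custom_od_sam2_video_tracking

    # Dummy 10-frame clip; a real caller would decode a video into RGB arrays.
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

    tracks = custom_od_sam2_video_tracking(
        deployment_id="23b3b022-5ebf-4798-9373-20ef36429abf",  # hypothetical deployment ID
        frames=frames,
        chunk_length=25,  # re-run detection every 25 frames
    )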
@@ -1416,7 +1352,6 @@ def _agentic_object_detection(
     image: np.ndarray,
     image_size: Tuple[int, ...],
     image_bytes: Optional[bytes] = None,
-    fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
     if image_bytes is None:
         image_bytes = numpy_to_bytes(image)

@@ -1428,21 +1363,6 @@ def _agentic_object_detection(
     }
     metadata = {"function_name": "agentic_object_detection"}
 
-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        # we can only execute fine-tuned models with florence2
-        payload = {
-            "prompts": payload["prompts"],
-            "jobId": fine_tune_id,
-            "model": "florence2",
-        }
-
     detections = send_task_inference_request(
         payload,
         "text-to-object-detection",

@@ -1478,7 +1398,6 @@ def _agentic_object_detection(
 def agentic_object_detection(
     prompt: str,
     image: np.ndarray,
-    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'agentic_object_detection' is a tool that can detect multiple objects given a
     text prompt such as object names or referring expressions on images. It's

@@ -1490,8 +1409,6 @@ def agentic_object_detection(
         prompt (str): The prompt to ground to the image, only supports a single prompt
             with no commas or periods.
         image (np.ndarray): The image to ground the prompt to.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and

@@ -1513,9 +1430,7 @@ def agentic_object_detection(
     if image_size[0] < 1 or image_size[1] < 1:
         return []
 
-    ret = _agentic_object_detection(
-        prompt, image, image_size, fine_tune_id=fine_tune_id
-    )
+    ret = _agentic_object_detection(prompt, image, image_size)
 
     _display_tool_trace(
         agentic_object_detection.__name__,
@@ -1586,7 +1501,6 @@ def agentic_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
     chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as object names or referring

@@ -1601,8 +1515,6 @@ def agentic_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
             to find new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the

@@ -1638,7 +1550,6 @@ def agentic_sam2_video_tracking(
         prompt=prompt,
         frames=frames,
         chunk_length=chunk_length,
-        fine_tune_id=fine_tune_id,
     )
     _display_tool_trace(
         agentic_sam2_video_tracking.__name__,
@@ -2797,16 +2708,17 @@ def save_video(
     ):
         raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)")
 
+    output_file: IO[bytes]
     if output_video_path is None:
-        output_video_path = tempfile.NamedTemporaryFile(
-            delete=False, suffix=".mp4"
-        ).name
+        output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
     else:
         Path(output_video_path).parent.mkdir(parents=True, exist_ok=True)
+        output_file = open(output_video_path, "wb")
 
-    output_video_path = video_writer(frames, fps, output_video_path)
-    _save_video_to_result(output_video_path)
-    return output_video_path
+    with output_file as file:
+        video_writer(frames, fps, file=file)
+    _save_video_to_result(output_file.name)
+    return output_file.name
 
 
 def _save_video_to_result(video_uri: str) -> None:
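save_video now funnels both branches through a single IO[bytes] handle: a NamedTemporaryFile when no path is given, otherwise a freshly opened file at output_video_path. A sketch of both call shapes, assuming the keyword names shown in the hunk (frames and paths are illustrative):

    import numpy as np
    from vision_agent.tools.tools import save_video

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

    tmp_path = save_video(frames)  # no path given: encodes into a temp .mp4 and returns its path
    out_path = save_video(frames, output_video_path="out/clip.mp4")  # parent dirs created, file opened "wb"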
{vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/exceptions.py

@@ -51,13 +51,6 @@ class RemoteSandboxClosedError(RemoteSandboxError):
     is_retryable = True
 
 
-class FineTuneModelIsNotReady(Exception):
-    """Exception raised when the fine-tune model is not ready.
-    If this is raised, it's recommended to wait 5 seconds before trying to use
-    the model again.
-    """
-
-
 class FineTuneModelNotFound(Exception):
     """Exception raised when the fine-tune model is not found.
     If this is raised, it's recommended to try another model id.
{vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/video.py

@@ -1,8 +1,7 @@
 import logging
-import os
 import tempfile
 from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import IO, List, Optional, Tuple
 
 import av  # type: ignore
 import cv2
@@ -25,39 +24,32 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray:
 def video_writer(
     frames: List[np.ndarray],
     fps: float = _DEFAULT_INPUT_FPS,
-    filename: Optional[str] = None,
-    file_ext: str = ".mp4",
+    file: Optional[IO[bytes]] = None,
 ) -> str:
-    tempf = None
     if isinstance(fps, str):
         # fps could be a string when it's passed in from a web endpoint deployment
         fps = float(fps)
-    if filename is None:
-        tempf = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext)
-        filename = tempf.name
-    container = av.open(filename, "w")
-    stream = container.add_stream("h264", rate=fps)
-    height, width = frames[0].shape[:2]
-    stream.height = height - (height % 2)
-    stream.width = width - (width % 2)
-    stream.pix_fmt = "yuv420p"
-    stream.options = {"crf": "10"}
-    for frame in frames:
-        # Remove the alpha channel (convert RGBA to RGB)
-        frame_rgb = frame[:, :, :3]
-        # Resize the frame to make dimensions divisible by 2
-        frame_rgb = _resize_frame(frame_rgb)
-        av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
-        for packet in stream.encode(av_frame):
+    if file is None:
+        file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+    with av.open(file, "w") as container:
+        stream = container.add_stream("h264", rate=fps)
+        height, width = frames[0].shape[:2]
+        stream.height = height - (height % 2)
+        stream.width = width - (width % 2)
+        stream.pix_fmt = "yuv420p"
+        stream.options = {"crf": "10"}
+        for frame in frames:
+            # Remove the alpha channel (convert RGBA to RGB)
+            frame_rgb = frame[:, :, :3]
+            # Resize the frame to make dimensions divisible by 2
+            frame_rgb = _resize_frame(frame_rgb)
+            av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
+            for packet in stream.encode(av_frame):
+                container.mux(packet)
+
+        for packet in stream.encode():
             container.mux(packet)
-
-    for packet in stream.encode():
-        container.mux(packet)
-    container.close()
-    # for windows nee to manually close tempfile, cannot use with NamedTemporaryFile(delete=True)
-    if tempf is not None:
-        tempf.close()
-    return filename
+    return file.name
 
 
 def frames_to_bytes(
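After this rewrite video_writer encodes into any writable binary handle via av.open(file, "w") and only creates its own NamedTemporaryFile as a fallback. A sketch of the two call shapes, assuming fps is passed by keyword (frame data is dummy):

    import numpy as np
    from vision_agent.utils.video import video_writer

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

    tmp_path = video_writer(frames, fps=24.0)  # no file: a temp .mp4 is created and its path returned
    with open("clip.mp4", "wb") as f:  # any IO[bytes] with a .name attribute works
        path = video_writer(frames, fps=24.0, file=f)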
@@ -73,11 +65,10 @@ def frames_to_bytes(
     if isinstance(fps, str):
         # fps could be a string when it's passed in from a web endpoint deployment
         fps = float(fps)
-
-    filename = video_writer(frames, fps, file_ext=file_ext)
-    with open(filename, "rb") as f:
+    with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as f:
+        video_writer(frames, fps, f)
+        f.seek(0)
         buffer_bytes = f.read()
-    os.unlink(filename)
     return buffer_bytes
 
 
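frames_to_bytes now reuses that file-object path: it hands the temporary handle to video_writer, seeks back to the start, and reads the encoded bytes before the context manager deletes the file. Usage sketch (dummy frames):

    import numpy as np
    from vision_agent.utils.video import frames_to_bytes

    frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(5)]
    data = frames_to_bytes(frames, fps=5.0)  # MP4-encoded bytes; no leftover temp file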
{vision_agent-0.2.239 → vision_agent-0.2.241}/vision_agent/utils/video_tracking.py

@@ -54,7 +54,7 @@ def process_segment(
     segment_frames: List[np.ndarray],
     od_model: ODModels,
     prompt: str,
-    fine_tune_id: Optional[str],
+    deployment_id: Optional[str],
     chunk_length: Optional[int],
     image_size: Tuple[int, ...],
     segment_index: int,

@@ -67,7 +67,7 @@ def process_segment(
         segment_frames (List[np.ndarray]): Frames in the segment.
         od_model (ODModels): Object detection model to use.
         prompt (str): Prompt for the model.
-        fine_tune_id (Optional[str]): The model fine-tune ID.
+        deployment_id (Optional[str]): The model deployment ID.
         chunk_length (Optional[int]): Chunk length for processing.
         image_size (Tuple[int, int]): Size of the images.
         segment_index (int): Index of the segment.

@@ -90,7 +90,12 @@ def process_segment(
     for idx in range(0, len(segment_frames), step):
         frame_number = idx
         segment_results[idx], function_name = object_detection_tool(
-            od_model, prompt, segment_index, frame_number, fine_tune_id, segment_frames
+            deployment_id=deployment_id,
+            frame_number=frame_number,
+            od_model=od_model,
+            prompt=prompt,
+            segment_frames=segment_frames,
+            segment_index=segment_index,
         )
 
         transformed_detections = transform_detections(
vision_agent-0.2.239/vision_agent/clients/landing_public_api.py (deleted)

@@ -1,38 +0,0 @@
-import os
-from typing import List
-from uuid import UUID
-
-from requests.exceptions import HTTPError
-
-from vision_agent.clients.http import BaseHTTP
-from vision_agent.models import BboxInputBase64, JobStatus, PromptTask
-from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.utils.type_defs import LandingaiAPIKey
-
-
-class LandingPublicAPI(BaseHTTP):
-    def __init__(self) -> None:
-        landing_url = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
-        landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
-        headers = {"Content-Type": "application/json", "apikey": landing_api_key}
-        super().__init__(base_endpoint=landing_url, headers=headers)
-
-    def launch_fine_tuning_job(
-        self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
-    ) -> UUID:
-        url = "v1/agent/jobs/fine-tuning"
-        data = {
-            "model": {"name": model_name, "task": task.value},
-            "bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
-        }
-        response = self.post(url, payload=data)
-        return UUID(response["jobId"])
-
-    def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
-        url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
-        try:
-            get_job = self.get(url)
-        except HTTPError as err:
-            if err.response.status_code == 404:
-                raise FineTuneModelNotFound()
-        return JobStatus(get_job["status"])
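Net effect of 0.2.241: the LandingPublicAPI client and every fine_tune_id code path are gone, so the detection tools either run their base model or, for custom-trained models, take a deployment_id. A hedged before/after sketch (the ID is the example UUID from the removed docstrings; the 0.2.239 line is reconstructed from the removed signatures above):

    import numpy as np
    from vision_agent.tools.tools import owlv2_object_detection, custom_object_detection

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy image

    # 0.2.239 (no longer possible): owlv2_object_detection("screw", image, fine_tune_id="23b3b022-...")
    detections = owlv2_object_detection("screw", image)  # 0.2.241: base model only

    custom = custom_object_detection(
        deployment_id="23b3b022-5ebf-4798-9373-20ef36429abf",  # hypothetical deployment ID
        image=image,
    )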