vision-agent 0.2.240__py3-none-any.whl → 0.2.242__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
 from vision_agent.models import (
     AgentMessage,
     CodeContext,
+    ErrorContext,
     InteractionContext,
     Message,
     PlanContext,
@@ -36,7 +37,7 @@ class AgentCoder(Agent):
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[CodeContext, InteractionContext]:
+    ) -> Union[CodeContext, InteractionContext, ErrorContext]:
         pass

     @abstractmethod
@@ -56,5 +57,5 @@ class AgentPlanner(Agent):
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[PlanContext, InteractionContext]:
+    ) -> Union[PlanContext, InteractionContext, ErrorContext]:
         pass
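Both abstract interfaces now admit ErrorContext as a third outcome. A minimal sketch (not from the package) of how a caller might branch on the widened return type; handle_result is a hypothetical helper:

    from typing import Union

    from vision_agent.models import CodeContext, ErrorContext, InteractionContext

    def handle_result(
        result: Union[CodeContext, InteractionContext, ErrorContext]
    ) -> str:
        # CodeContext carries generated code, InteractionContext a pending user
        # interaction, and ErrorContext the raw error string from a refusal.
        if isinstance(result, CodeContext):
            return result.code
        if isinstance(result, InteractionContext):
            return result.chat[-1].content
        return result.error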
@@ -13,6 +13,7 @@ from vision_agent.lmm import LMM
 from vision_agent.models import (
     AgentMessage,
     CodeContext,
+    ErrorContext,
     InteractionContext,
     Message,
     PlanContext,
@@ -365,6 +366,8 @@ class VisionAgentCoderV2(AgentCoder):
         code_or_interaction = self.generate_code(input_msg)
         if isinstance(code_or_interaction, InteractionContext):
             return code_or_interaction.chat[-1].content
+        elif isinstance(code_or_interaction, ErrorContext):
+            return code_or_interaction.error
         return code_or_interaction.code

     def generate_code(
@@ -372,7 +375,7 @@ class VisionAgentCoderV2(AgentCoder):
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[CodeContext, InteractionContext]:
+    ) -> Union[CodeContext, InteractionContext, ErrorContext]:
         """Generate vision code from a conversation.

         Parameters:
@@ -404,6 +407,8 @@ class VisionAgentCoderV2(AgentCoder):
         # the planner needs an interaction, so return before generating code
         if isinstance(plan_context, InteractionContext):
             return plan_context
+        elif isinstance(plan_context, ErrorContext):
+            return plan_context

         code_context = self.generate_code_from_plan(
             orig_chat,
@@ -24,7 +24,13 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
 )
 from vision_agent.configs import Config
 from vision_agent.lmm import LMM
-from vision_agent.models import AgentMessage, InteractionContext, Message, PlanContext
+from vision_agent.models import (
+    AgentMessage,
+    ErrorContext,
+    InteractionContext,
+    Message,
+    PlanContext,
+)
 from vision_agent.tools.planner_tools import check_function_call
 from vision_agent.utils.agent import (
     add_media_to_chat,
@@ -322,7 +328,7 @@ def create_finalize_plan(
     model: LMM,
     chat: List[AgentMessage],
     verbose: bool = False,
-) -> Tuple[List[AgentMessage], PlanContext]:
+) -> Tuple[List[AgentMessage], Union[PlanContext, ErrorContext]]:
     # if we're in the middle of an interaction, don't finalize the plan
     if chat[-1].role == "interaction":
         return [], PlanContext(plan="", instructions=[], code="")
@@ -337,11 +343,19 @@ def create_finalize_plan(
     return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]

     plan_json = extract_tag(plan_str, "json")
-    plan = (
-        extract_json(plan_json)
-        if plan_json is not None
-        else {"plan": plan_str, "instructions": [], "code": ""}
-    )
+
+    # sometimes the planner model will refuse to answer a question becuase of some
+    # safety concern, we then wont be able to parse the response so we have to send
+    # it back to the user/conversation agent
+    try:
+        plan = (
+            extract_json(plan_json)
+            if plan_json is not None
+            else {"plan": plan_str, "instructions": [], "code": ""}
+        )
+    except json.JSONDecodeError:
+        return return_chat, ErrorContext(error=plan_str)
+
     code_snippets = extract_tag(plan_str, "code")
     plan["code"] = code_snippets if code_snippets is not None else ""
     if verbose:
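The try/except above is the core of the change: when the planner refuses and its reply cannot be parsed as JSON, the raw reply is wrapped in an ErrorContext instead of raising. A self-contained sketch of the same parse-or-wrap pattern, using json.loads as a stand-in for the package's extract_json helper:

    import json
    from typing import Union

    from vision_agent.models import ErrorContext

    def parse_plan(plan_str: str) -> Union[dict, ErrorContext]:
        # A refusal such as "I can't help with that" is not valid JSON, so the
        # raw text is returned as an ErrorContext for the conversation agent.
        try:
            return json.loads(plan_str)
        except json.JSONDecodeError:
            return ErrorContext(error=plan_str)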
@@ -473,14 +487,17 @@ class VisionAgentPlannerV2(AgentPlanner):
         plan_or_interaction = self.generate_plan(input_msg)
         if isinstance(plan_or_interaction, InteractionContext):
             return plan_or_interaction.chat[-1].content
-        return plan_or_interaction.plan
+        elif isinstance(plan_or_interaction, PlanContext):
+            return plan_or_interaction.plan
+        else:
+            return plan_or_interaction.error

     def generate_plan(
         self,
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[PlanContext, InteractionContext]:
+    ) -> Union[PlanContext, InteractionContext, ErrorContext]:
         """Generate a plan to solve a vision task.

         Parameters:
@@ -571,7 +588,7 @@ class VisionAgentPlannerV2(AgentPlanner):
         for chat_elt in updated_chat:
             self.update_callback(chat_elt.model_dump())

-        context: Union[PlanContext, InteractionContext]
+        context: Union[PlanContext, InteractionContext, ErrorContext]
         if interaction:
             context = InteractionContext(chat=int_chat)
         else:
@@ -16,17 +16,29 @@ AGENT: <response>Yes, I can help you with that. I will write the code to detect
 OBSERVATION:
 <final_code>
 from vision_agent.tools import load_image, owl_v2_image
-def detect_dogs(image_path: str):
+def detect_dogs(image_path: str) -> int:
     image = load_image(image_path)
     dogs = owl_v2_image(image)
-    return dogs
+    return len(dogs)
 </final_code>
 <final_test>
 def test_detect_dogs():
     dogs = detect_dogs("images/dogs.jpg")
-    assert len(dogs) > 0
+    assert isinstance(dogs, int)
+    print(f"Number of dogs detected: {{dogs}}")
+    return dogs
 </final_test>

+OBSERVATION: ----- stdout -----
+Number of dogs detected: 8
+
+----- stderr -----
+
+----- Intermediate output-----
+None
+----- Final output -----
+8
+
 AGENT: <response>Here is the code to detect dogs in the image.</response>
 --- END EXAMPLE1 ---

@@ -11,6 +11,7 @@ from vision_agent.lmm import LMM
 from vision_agent.models import (
     AgentMessage,
     CodeContext,
+    ErrorContext,
     InteractionContext,
     Message,
     PlanContext,
@@ -27,7 +28,9 @@ CONFIG = Config()


 def extract_conversation(
-    chat: List[AgentMessage], include_conv: bool = False
+    chat: List[AgentMessage],
+    include_conv: bool = False,
+    include_errors: bool = False,
 ) -> Tuple[List[AgentMessage], Optional[str]]:
     chat = copy.deepcopy(chat)

@@ -43,13 +46,18 @@ def extract_conversation(
         elif chat_i.role == "coder":
             if "<final_code>" in chat_i.content:
                 extracted_chat.append(chat_i)
+        elif chat_i.role == "final_observation":
+            extracted_chat.append(chat_i)
         elif include_conv and chat_i.role == "conversation":
             extracted_chat.append(chat_i)
+        elif include_errors and chat_i.role == "error_observation":
+            extracted_chat.append(chat_i)

-    # only keep the last <final_code> and <final_test>
+    # only keep the last <final_code>, <final_test>
     final_code = None
     extracted_chat_strip_code: List[AgentMessage] = []
-    for chat_i in reversed(extracted_chat):
+    for chat_i in reversed((extracted_chat)):
+        # don't check role here because user could send updated <final_code>
         if "<final_code>" in chat_i.content and final_code is None:
             extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
             final_code = extract_tag(chat_i.content, "final_code")
@@ -66,7 +74,12 @@ def extract_conversation(


 def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
-    extracted_chat, _ = extract_conversation(chat, include_conv=True)
+    # Include conversation and error messages. The error messages can come from one of
+    # the agents refusing to write a correctly formatted message, want to inform the
+    # conversation agent of this.
+    extracted_chat, _ = extract_conversation(
+        chat, include_conv=True, include_errors=True
+    )

     conv = format_conversation(extracted_chat)
     prompt = CONVERSATION.format(
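A small usage sketch (not from the package) of the new include_errors flag; the chat contents are made up and the import path follows the module layout listed in RECORD:

    from vision_agent.agent.vision_agent_v2 import extract_conversation
    from vision_agent.models import AgentMessage

    chat = [
        AgentMessage(role="user", content="Count the dogs in dogs.jpg"),
        AgentMessage(role="error_observation", content="planner reply was not valid JSON"),
    ]
    # With include_errors=True the error_observation survives filtering, so the
    # conversation agent can explain why planning stopped.
    extracted_chat, final_code = extract_conversation(
        chat, include_conv=True, include_errors=True
    )
    print([msg.role for msg in extracted_chat])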
@@ -101,7 +114,9 @@ def maybe_run_action(
         if isinstance(context, CodeContext):
             return [
                 AgentMessage(role="coder", content=format_code_context(context)),
-                AgentMessage(role="observation", content=context.test_result.text()),
+                AgentMessage(
+                    role="final_observation", content=context.test_result.text()
+                ),
             ]
         elif isinstance(context, InteractionContext):
             return [
@@ -110,6 +125,10 @@ def maybe_run_action(
                     content=json.dumps([elt.model_dump() for elt in context.chat]),
                 )
             ]
+        elif isinstance(context, ErrorContext):
+            return [
+                AgentMessage(role="error_observation", content=context.error),
+            ]
     elif action == "edit_code":
         # We don't want to pass code in plan_context.code so the coder will generate
         # new code from plan_context.plan
@@ -129,7 +148,7 @@ def maybe_run_action(
         )
         return [
             AgentMessage(role="coder", content=format_code_context(context)),
-            AgentMessage(role="observation", content=context.test_result.text()),
+            AgentMessage(role="final_observation", content=context.test_result.text()),
         ]
     elif action == "view_image":
         pass
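Taken together, maybe_run_action now reports every outcome back to the conversation loop under a role that says what happened. A condensed, illustrative reduction of that dispatch (not the package's exact code):

    import json
    from typing import List, Union

    from vision_agent.models import (
        AgentMessage,
        CodeContext,
        ErrorContext,
        InteractionContext,
    )

    def summarize_outcome(
        context: Union[CodeContext, InteractionContext, ErrorContext],
    ) -> List[AgentMessage]:
        if isinstance(context, CodeContext):
            # code ran: the test output becomes a "final_observation"
            return [AgentMessage(role="final_observation", content=context.test_result.text())]
        if isinstance(context, InteractionContext):
            # the planner paused for user input: forward the pending chat
            return [
                AgentMessage(
                    role="interaction",
                    content=json.dumps([elt.model_dump() for elt in context.chat]),
                )
            ]
        # a model refused or produced an unparseable reply
        return [AgentMessage(role="error_observation", content=context.error)]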
@@ -1,4 +1,10 @@
-from .agent_types import AgentMessage, CodeContext, InteractionContext, PlanContext
+from .agent_types import (
+    AgentMessage,
+    CodeContext,
+    ErrorContext,
+    InteractionContext,
+    PlanContext,
+)
 from .lmm_types import Message, TextOrImage
 from .tools_types import (
     BboxInput,
@@ -29,11 +29,15 @@ class AgentMessage(BaseModel):
         Literal["user"],
         Literal["assistant"],  # planner, coder and conversation are of type assistant
         Literal["observation"],
+        Literal["final_observation"],  # the observation from the final code output
+        Literal["error_observation"],  # the observation from the error message
         Literal["interaction"],
         Literal["interaction_response"],
         Literal["conversation"],
         Literal["planner"],
-        Literal["planner_update"],
+        Literal[
+            "planner_update"
+        ],  # an intermediate update from the planner to show partial information
         Literal["coder"],
     ]
     content: str
@@ -75,3 +79,14 @@ class InteractionContext(BaseModel):
     """

     chat: List[AgentMessage]
+
+
+class ErrorContext(BaseModel):
+    """ErrorContext is a data model that represents an error message. These errors can
+    happen in the planning phase when a model does not output correctly formatted
+    messages (often because it considers some response to be a safety issue).
+
+    error: The error message.
+    """
+
+    error: str
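ErrorContext is a plain pydantic model, and the new roles are ordinary values of AgentMessage.role, so both serialize like everything else in the chat. A short usage sketch under those assumptions:

    from vision_agent.models import AgentMessage, ErrorContext

    err = ErrorContext(error="Planner refused to produce <json> output")

    # the error travels through the chat as a normal message
    msg = AgentMessage(role="error_observation", content=err.error)
    print(msg.model_dump())

    # round-trips through dict form like any other pydantic model
    assert ErrorContext.model_validate({"error": "boom"}).error == "boom"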
@@ -7,9 +7,7 @@ from .meta_tools import (
     generate_vision_code,
     get_tool_descriptions,
     list_artifacts,
-    object_detection_fine_tuning,
     open_code_artifact,
-    use_object_detection_fine_tuning,
     view_media_artifact,
 )
 from .planner_tools import judge_od_results
@@ -11,11 +11,9 @@ import libcst as cst
 from IPython.display import display

 import vision_agent as va
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.models import BboxInput, BboxInputBase64, Message, PromptTask
+from vision_agent.models import Message
 from vision_agent.tools.tools import get_tools_descriptions as _get_tool_descriptions
 from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64
 from vision_agent.utils.tools_doc import get_tool_documentation

 CURRENT_FILE = None
@@ -573,48 +571,6 @@ def get_tool_descriptions() -> str:
     return _get_tool_descriptions()


-def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
-    """DO NOT use this function unless the user has supplied you with bboxes.
-    'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
-    be able to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
-            and bounding boxes. The coordinates are unnormalized.
-
-    Returns:
-        str: The fine tuning job id, this id will used to retrieve the fine tuned
-            model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = object_detection_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-            {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "phrase_grounding"
-        )
-    """
-    task = "phrase_grounding"
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_type = PromptTask[task.upper()]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=Path(bbox_input.image_path).name,
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    fine_tune_id = str(
-        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
-    )
-    print(f"[Fine tuning id: {fine_tune_id}]")
-    return fine_tune_id
-
-
 def get_diff(before: str, after: str) -> str:
     return "".join(
         difflib.unified_diff(
@@ -721,83 +677,6 @@ def use_extra_vision_agent_args(
     return modified_tree.code


-def use_object_detection_fine_tuning(
-    artifacts: Artifacts, name: str, fine_tune_id: str
-) -> str:
-    """Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and
-    'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
-    the fined tuned florence2 model. Returns the diff between the original code and the
-    new code.
-
-    Parameters:
-        artifacts (Artifacts): The artifacts object to edit the code from.
-        name (str): The name of the artifact to edit.
-        fine_tune_id (str): The fine tuning job id.
-
-    Examples
-    --------
-        >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
-    """
-
-    if name not in artifacts:
-        output_str = f"[Artifact {name} does not exist]"
-        print(output_str)
-        return output_str
-
-    code = artifacts[name]
-
-    patterns_with_fine_tune_id = [
-        (
-            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-        (
-            r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-        (
-            r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'owl_v2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-        (
-            r'florence2_sam2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_sam2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
-        ),
-    ]
-
-    new_code = code
-    for (
-        pattern_with_fine_tune_id,
-        replacer_with_fine_tune_id,
-    ) in patterns_with_fine_tune_id:
-        if re.search(pattern_with_fine_tune_id, new_code):
-            new_code = re.sub(
-                pattern_with_fine_tune_id, replacer_with_fine_tune_id, new_code
-            )
-
-    if new_code == code:
-        output_str = (
-            f"[No function calls to replace with fine tuning id in artifact {name}]"
-        )
-        print(output_str)
-        return output_str
-
-    artifacts[name] = new_code
-
-    diff = get_diff_with_prompts(name, code, new_code)
-    print(diff)
-
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {"name": name, "content": new_code, "action": "edit"}
-            )
-        },
-        raw=True,
-    )
-    return diff
-
-
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -807,8 +686,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         view_media_artifact,
-        object_detection_fine_tuning,
-        use_object_detection_fine_tuning,
         list_artifacts,
     ]
 )
@@ -9,7 +9,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import resources
 from pathlib import Path
 from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union, cast
-from uuid import UUID

 import cv2
 import numpy as np
@@ -20,10 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore

-from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.lmm import LMM, AnthropicLMM, OpenAILMM
-from vision_agent.models import JobStatus
-from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
@@ -239,7 +235,7 @@ def od_sam2_video_tracking(
     frames: List[np.ndarray],
     box_threshold: float = 0.30,
     chunk_length: Optional[int] = 50,
-    fine_tune_id: Optional[str] = None,
+    deployment_id: Optional[str] = None,
 ) -> Dict[str, Any]:
     chunk_length = 50 if chunk_length is None else chunk_length
     segment_size = chunk_length
@@ -262,7 +258,7 @@ def od_sam2_video_tracking(
         prompt: str,
         segment_index: int,
         frame_number: int,
-        fine_tune_id: str,
+        deployment_id: str,
         segment_frames: list,
     ) -> tuple:
         """
@@ -273,7 +269,7 @@ def od_sam2_video_tracking(
            prompt: The prompt for the object detection model.
            segment_index: The index of the current segment.
            frame_number: The number of the current frame.
-           fine_tune_id: Optional fine-tune ID for the model.
+           deployment_id: Optional The Model deployment ID.
            segment_frames: List of frames for the current segment.

        Returns:
@@ -293,7 +289,6 @@ def od_sam2_video_tracking(
                prompt=prompt,
                image=segment_frames[frame_number],
                box_threshold=box_threshold,
-               fine_tune_id=fine_tune_id,
            )
            function_name = "owlv2_object_detection"

@@ -301,7 +296,6 @@ def od_sam2_video_tracking(
            segment_results = florence2_object_detection(
                prompt=prompt,
                image=segment_frames[frame_number],
-               fine_tune_id=fine_tune_id,
            )
            function_name = "florence2_object_detection"

@@ -309,13 +303,12 @@
            segment_results = agentic_object_detection(
                prompt=prompt,
                image=segment_frames[frame_number],
-               fine_tune_id=fine_tune_id,
            )
            function_name = "agentic_object_detection"

        elif od_model == ODModels.CUSTOM:
            segment_results = custom_object_detection(
-               deployment_id=fine_tune_id,
+               deployment_id=deployment_id,
                image=segment_frames[frame_number],
                box_threshold=box_threshold,
            )
@@ -337,7 +330,7 @@ def od_sam2_video_tracking(
                segment_frames=segment,
                od_model=od_model,
                prompt=prompt,
-               fine_tune_id=fine_tune_id,
+               deployment_id=deployment_id,
                chunk_length=chunk_length,
                image_size=image_size,
                segment_index=segment_index,
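With the fine_tune_id plumbing removed, the only model selector left on this path is deployment_id, which is forwarded to custom_object_detection. A minimal sketch of that call (import path assumed from the package layout); the image and deployment ID are placeholders:

    import numpy as np

    from vision_agent.tools.tools import custom_object_detection

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder frame
    detections = custom_object_detection(
        deployment_id="00000000-0000-0000-0000-000000000000",  # placeholder ID
        image=image,
        box_threshold=0.30,
    )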
@@ -376,7 +369,6 @@ def _owlv2_object_detection(
     box_threshold: float,
     image_size: Tuple[int, ...],
     image_bytes: Optional[bytes] = None,
-    fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
     if image_bytes is None:
         image_bytes = numpy_to_bytes(image)
@@ -389,21 +381,6 @@
     }
     metadata = {"function_name": "owlv2_object_detection"}

-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        # we can only execute fine-tuned models with florence2
-        payload = {
-            "prompts": payload["prompts"],
-            "jobId": fine_tune_id,
-            "model": "florence2",
-        }
-
     detections = send_task_inference_request(
         payload,
         "text-to-object-detection",
@@ -440,7 +417,6 @@ def owlv2_object_detection(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
-    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'owlv2_object_detection' is a tool that can detect and count multiple objects
     given a text prompt such as category names or referring expressions on images. The
@@ -452,8 +428,6 @@
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -475,9 +449,7 @@
     if image_size[0] < 1 or image_size[1] < 1:
         return []

-    ret = _owlv2_object_detection(
-        prompt, image, box_threshold, image_size, fine_tune_id=fine_tune_id
-    )
+    ret = _owlv2_object_detection(prompt, image, box_threshold, image_size)

     _display_tool_trace(
         owlv2_object_detection.__name__,
@@ -556,7 +528,6 @@ def owlv2_sam2_video_tracking(
     frames: List[np.ndarray],
     box_threshold: float = 0.10,
     chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
@@ -571,8 +542,6 @@
            to 0.10.
         chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
             new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -609,7 +578,6 @@
         frames=frames,
         box_threshold=box_threshold,
         chunk_length=chunk_length,
-        fine_tune_id=fine_tune_id,
     )
     _display_tool_trace(
         owlv2_sam2_video_tracking.__name__,
@@ -624,7 +592,8 @@


 def florence2_object_detection(
-    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+    prompt: str,
+    image: np.ndarray,
 ) -> List[Dict[str, Any]]:
     """'florence2_object_detection' is a tool that can detect multiple objects given a
     text prompt which can be object names or caption. You can optionally separate the
@@ -635,8 +604,6 @@ def florence2_object_detection(
         prompt (str): The prompt to ground to the image. Use exclusive categories that
             do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to used to detect objects
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -653,6 +620,7 @@
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
     ]
     """
+
     image_size = image.shape[:2]
     if image_size[0] < 1 or image_size[1] < 1:
         return []
@@ -665,16 +633,6 @@
     }
     metadata = {"function_name": "florence2_object_detection"}

-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        payload["jobId"] = fine_tune_id
-
     detections = send_task_inference_request(
         payload,
         "text-to-object-detection",
@@ -703,7 +661,8 @@


 def florence2_sam2_instance_segmentation(
-    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+    prompt: str,
+    image: np.ndarray,
 ) -> List[Dict[str, Any]]:
     """'florence2_sam2_instance_segmentation' is a tool that can segment multiple
     objects given a text prompt such as category names or referring expressions. The
@@ -715,8 +674,6 @@ def florence2_sam2_instance_segmentation(
         prompt (str): The prompt to ground to the image. Use exclusive categories that
             do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to ground the prompt to.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -742,6 +699,7 @@
         },
     ]
     """
+
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []

@@ -753,16 +711,6 @@
     }
     metadata = {"function_name": "florence2_sam2_instance_segmentation"}

-    if fine_tune_id is not None:
-        landing_api = LandingPublicAPI()
-        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-        if status is not JobStatus.SUCCEEDED:
-            raise FineTuneModelIsNotReady(
-                f"Fine-tuned model {fine_tune_id} is not ready yet"
-            )
-
-        payload["jobId"] = fine_tune_id
-
     detections = send_task_inference_request(
         payload,
         "text-to-instance-segmentation",
@@ -792,7 +740,6 @@ def florence2_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
     chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as category names or referring
@@ -806,8 +753,6 @@
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -837,6 +782,7 @@
         ...
     ]
    """
+
    if len(frames) == 0 or not isinstance(frames, List):
        raise ValueError("Must provide a list of numpy arrays for frames")

@@ -851,16 +797,6 @@
    if chunk_length is not None:
        payload["chunk_length_frames"] = chunk_length  # type: ignore

-   if fine_tune_id is not None:
-       landing_api = LandingPublicAPI()
-       status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-       if status is not JobStatus.SUCCEEDED:
-           raise FineTuneModelIsNotReady(
-               f"Fine-tuned model {fine_tune_id} is not ready yet"
-           )
-
-       payload["jobId"] = fine_tune_id
-
    detections = send_task_inference_request(
        payload,
        "text-to-instance-segmentation",
@@ -1397,7 +1333,7 @@ def custom_od_sam2_video_tracking(
        prompt="",
        frames=frames,
        chunk_length=chunk_length,
-       fine_tune_id=deployment_id,
+       deployment_id=deployment_id,
    )
    _display_tool_trace(
        custom_od_sam2_video_tracking.__name__,
@@ -1416,7 +1352,6 @@ def _agentic_object_detection(
    image: np.ndarray,
    image_size: Tuple[int, ...],
    image_bytes: Optional[bytes] = None,
-   fine_tune_id: Optional[str] = None,
 ) -> Dict[str, Any]:
    if image_bytes is None:
        image_bytes = numpy_to_bytes(image)
@@ -1428,21 +1363,6 @@
    }
    metadata = {"function_name": "agentic_object_detection"}

-   if fine_tune_id is not None:
-       landing_api = LandingPublicAPI()
-       status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
-       if status is not JobStatus.SUCCEEDED:
-           raise FineTuneModelIsNotReady(
-               f"Fine-tuned model {fine_tune_id} is not ready yet"
-           )
-
-       # we can only execute fine-tuned models with florence2
-       payload = {
-           "prompts": payload["prompts"],
-           "jobId": fine_tune_id,
-           "model": "florence2",
-       }
-
    detections = send_task_inference_request(
        payload,
        "text-to-object-detection",
@@ -1478,7 +1398,6 @@
 def agentic_object_detection(
     prompt: str,
     image: np.ndarray,
-    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'agentic_object_detection' is a tool that can detect multiple objects given a
     text prompt such as object names or referring expressions on images. It's
@@ -1490,8 +1409,6 @@
         prompt (str): The prompt to ground to the image, only supports a single prompt
             with no commas or periods.
         image (np.ndarray): The image to ground the prompt to.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -1513,9 +1430,7 @@
     if image_size[0] < 1 or image_size[1] < 1:
         return []

-    ret = _agentic_object_detection(
-        prompt, image, image_size, fine_tune_id=fine_tune_id
-    )
+    ret = _agentic_object_detection(prompt, image, image_size)

     _display_tool_trace(
         agentic_object_detection.__name__,
@@ -1586,7 +1501,6 @@ def agentic_sam2_video_tracking(
     prompt: str,
     frames: List[np.ndarray],
     chunk_length: Optional[int] = 25,
-    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as object names or referring
@@ -1601,8 +1515,6 @@
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
             to find new objects.
-        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
-            fine-tuned model ID here to use it.

     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -1638,7 +1550,6 @@
         prompt=prompt,
         frames=frames,
         chunk_length=chunk_length,
-        fine_tune_id=fine_tune_id,
     )
     _display_tool_trace(
         agentic_sam2_video_tracking.__name__,
@@ -159,11 +159,12 @@ def format_conversation(chat: List[AgentMessage]) -> str:
     chat = copy.deepcopy(chat)
     prompt = ""
     for chat_i in chat:
-        if chat_i.role == "user" or chat_i.role == "coder":
-            if "<final_code>" in chat_i.content:
-                prompt += f"OBSERVATION: {chat_i.content}\n\n"
-            elif chat_i.role == "user":
+        # we want to print user messages, final code, final code observations or errors
+        if chat_i.role in ["user", "coder", "final_observation", "error_observation"]:
+            if chat_i.role == "user":
                 prompt += f"USER: {chat_i.content}\n\n"
+            else:
+                prompt += f"OBSERVATION: {chat_i.content}\n\n"
         elif chat_i.role == "conversation":
             prompt += f"AGENT: {chat_i.content}\n\n"
     return prompt
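For reference, a small illustration (not from the package, import path per the RECORD layout) of the prefixes the rewritten loop produces for a mixed chat:

    from vision_agent.models import AgentMessage
    from vision_agent.utils.agent import format_conversation

    chat = [
        AgentMessage(role="user", content="How many dogs are in dogs.jpg?"),
        AgentMessage(role="final_observation", content="Number of dogs detected: 8"),
        AgentMessage(role="error_observation", content="planner reply was not valid JSON"),
        AgentMessage(role="conversation", content="I found 8 dogs."),
    ]
    # user -> "USER:", final_/error_observation (and coder) -> "OBSERVATION:",
    # conversation -> "AGENT:"
    print(format_conversation(chat))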
@@ -51,13 +51,6 @@ class RemoteSandboxClosedError(RemoteSandboxError):
     is_retryable = True


-class FineTuneModelIsNotReady(Exception):
-    """Exception raised when the fine-tune model is not ready.
-    If this is raised, it's recommended to wait 5 seconds before trying to use
-    the model again.
-    """
-
-
 class FineTuneModelNotFound(Exception):
     """Exception raised when the fine-tune model is not found.
     If this is raised, it's recommended to try another model id.
@@ -54,7 +54,7 @@ def process_segment(
     segment_frames: List[np.ndarray],
     od_model: ODModels,
     prompt: str,
-    fine_tune_id: Optional[str],
+    deployment_id: Optional[str],
     chunk_length: Optional[int],
     image_size: Tuple[int, ...],
     segment_index: int,
@@ -67,7 +67,7 @@ def process_segment(
         segment_frames (List[np.ndarray]): Frames in the segment.
         od_model (ODModels): Object detection model to use.
         prompt (str): Prompt for the model.
-        fine_tune_id (Optional[str]): Fine-tune model ID.
+        deployment_id (Optional[str]): The model deployment ID.
         chunk_length (Optional[int]): Chunk length for processing.
         image_size (Tuple[int, int]): Size of the images.
         segment_index (int): Index of the segment.
@@ -90,7 +90,12 @@ def process_segment(
     for idx in range(0, len(segment_frames), step):
         frame_number = idx
         segment_results[idx], function_name = object_detection_tool(
-            od_model, prompt, segment_index, frame_number, fine_tune_id, segment_frames
+            deployment_id=deployment_id,
+            frame_number=frame_number,
+            od_model=od_model,
+            prompt=prompt,
+            segment_frames=segment_frames,
+            segment_index=segment_index,
        )

        transformed_detections = transform_detections(
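The positional call is replaced by keyword arguments, so whatever callable is passed in as object_detection_tool is now invoked by parameter name rather than position. A stand-in sketch of a callable matching that calling convention (the body is illustrative only):

    from typing import Any, Dict, List, Optional, Tuple

    import numpy as np

    def my_od_tool(
        od_model: Any,
        prompt: str,
        segment_index: int,
        frame_number: int,
        deployment_id: Optional[str],
        segment_frames: List[np.ndarray],
    ) -> Tuple[List[Dict[str, Any]], str]:
        # process_segment calls this with keyword arguments, so only the parameter
        # names have to match, not their order.
        return [], "my_od_tool"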
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.240
+Version: 0.2.242
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -3,22 +3,21 @@ vision_agent/.sim_tools/embs.npy,sha256=pi7h3NHlrKncIGNR-oPn_XoTe2PzBb9-aFMi7qK0
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
 vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
-vision_agent/agent/agent.py,sha256=RoS7kMfXYILv0zuPpcxqQIlaHGa3K-qw_5EwgsEJTPQ,1530
+vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
 vision_agent/agent/vision_agent.py,sha256=4LqvwPTSsiuJEDwBbMx9Dg9ALJwNR6x1c63TZvOMm8A,23486
 vision_agent/agent/vision_agent_coder.py,sha256=Ry6AiyAj3hsSeYPu_5guMcTzf2E4SoebPzpHyJtSPbQ,27360
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=D4RJxTWoxpl-WtYRvHNxaLSdWVHsdYb0jJIQ2ZCGU0A,12277
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
-vision_agent/agent/vision_agent_coder_v2.py,sha256=bWAUyk7-lYKwLIjkL_wUTeYv06zIIroJE1yIDRaGUHw,17059
+vision_agent/agent/vision_agent_coder_v2.py,sha256=I4gWrneFIqhX6W-MxiaNyPKGk5tRKgC8xryV-YdeSZU,17289
 vision_agent/agent/vision_agent_planner.py,sha256=rp_atRMDg35WFXNKOTkjUpGPrpSCsiMhcfZtqK-DIV4,18668
 vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
 vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=TiiF5BGnFVraFlQnDaeRU67927LvszvpcMUOgVgo0ps,35843
-vision_agent/agent/vision_agent_planner_v2.py,sha256=IqEP5ded5P4ESkLEur81gUvJtTmTdlKAx9uQyLyIwPc,21212
+vision_agent/agent/vision_agent_planner_v2.py,sha256=GOhaTsVCh02X09IKkC4k9z79lsmU4VgRW7WJLKjdG1k,21755
 vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
-vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
-vision_agent/agent/vision_agent_v2.py,sha256=O070_QdgsqNzex5eRtye8QmJgCtHcf_B7zeteWpw3LM,10895
+vision_agent/agent/vision_agent_prompts_v2.py,sha256=jTfu_heNTBaHj1UNI0XIyyFDgDOjPTPP83vrS-g3A1U,2961
+vision_agent/agent/vision_agent_v2.py,sha256=QPAyDjnRRHUCD4Pw4TQYffWkucbn4WkEjYn8dBIWll4,11682
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
-vision_agent/clients/landing_public_api.py,sha256=Vz9lldtNbaJRWzT7T8-uQrC-dMnt47LIsDrxHgoVdEw,1492
 vision_agent/configs/__init__.py,sha256=Iu75-w9_nlPmnB_qKA7nYaaaHf7xtTrDmK8N4v2WV34,27
 vision_agent/configs/anthropic_config.py,sha256=T1UuESgiY8913A6wA42P7-cg8FTk9-LkJpyywo7OnIQ,4298
 vision_agent/configs/anthropic_openai_config.py,sha256=rUz5zca4Pn5dTUwJXiJzRDYua5PWizApCKI3y0zOvhc,4699
@@ -28,28 +27,28 @@ vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=4qX2lmGnKWHeKftXueEi9xj_ieK2nQh_ipHf72nKGFk,84
 vision_agent/lmm/lmm.py,sha256=XYp1frrqQ-6q-0y2IWwM8-EIH5UrFZ21SAhkcM32J9w,19355
-vision_agent/models/__init__.py,sha256=qAdygB-0EsmxMHNzYTPNM6tAF8Fym95gm9bsHJafdgE,287
-vision_agent/models/agent_types.py,sha256=dIdxATH_PP76pD5Wfo0oofWt6iPQh0vpf48QbEQSzhs,2472
+vision_agent/models/__init__.py,sha256=eIP0pD5dYog8zUA7uuTmUxCF6SIutbLRLRE0cmuCJgQ,326
+vision_agent/models/agent_types.py,sha256=vBZ9-ns5lHDdFMO7ulCGGeZ6OwRo3gK4O3vN0814IWc,3064
 vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1cXmw,305
 vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
 vision_agent/sim/sim.py,sha256=VSU_1rYd4ifvF45xKWBEYugxdeeEQVpj0QL6rjx49i4,9801
-vision_agent/tools/__init__.py,sha256=T-MPNBVbvWtfo71hobaZsdYzQ52oyymolk_OAb2Pq_g,2463
-vision_agent/tools/meta_tools.py,sha256=-heMwGkx0hX_9zUp1dgBqsJpVnl6Y6tErMsjFy0dwLM,28652
+vision_agent/tools/__init__.py,sha256=bYrOPuqrpwFA3TeY_pxRXVv61oJsxVWVgv1psJlBEcc,2391
+vision_agent/tools/meta_tools.py,sha256=DNRXHX9nZ1GBeqeLiq87sBshoe0aiZeYasETbG-9neI,24053
 vision_agent/tools/planner_tools.py,sha256=orBTdJQz2NKoLuX9WE6XixaYuG305xz0UBYvZOiuquQ,19474
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=8J-SYpyUeqMDajF7kp2aiTeBBQrJEWGVdEsQLPAc-OM,111511
+vision_agent/tools/tools.py,sha256=uhvgPeAzhOV2vfBa216vq-JVItqgzIRKs1JMBezj2Es,107631
 vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
-vision_agent/utils/agent.py,sha256=QGKcbzpAjcVj0958bXYLv07-d2i1GU7-bXVG7bTGRMA,14619
-vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
+vision_agent/utils/agent.py,sha256=8z4Ei0q397lVWUga8v9nQKuenGAsh2wfkAKQOB8CwpI,14701
+vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
 vision_agent/utils/execute.py,sha256=vOEP5Ys7S2lc0_7pOJbgk7OaWi85hrCNu9_8Bo3zk6I,29356
 vision_agent/utils/image_utils.py,sha256=bJM2mEvB6E__M9pxi74yQYzAiZ7mu3KE2ptyVrp5vzQ,12533
 vision_agent/utils/tools.py,sha256=USZL0MKsiJgqA8RFiYRTcj_Kn2FVYKLHK4wIk0gP1Ow,7694
 vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
-vision_agent/utils/video_tracking.py,sha256=GM9qfeawqhmZVWoKrzw5-NETd4gEo7ImMfWtBnhC3bw,12086
-vision_agent-0.2.240.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.240.dist-info/METADATA,sha256=l9FlzNIT3ncQNxkIlTTUsB1aaL-7u2b1OtvYcRv0AIE,5712
-vision_agent-0.2.240.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.240.dist-info/RECORD,,
+vision_agent/utils/video_tracking.py,sha256=eMIiWOG24bgXbqOy1DTtepO2gPo1ClW6Y0tdbEF_14k,12227
+vision_agent-0.2.242.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.242.dist-info/METADATA,sha256=Lvr9OdngkgZJd-ifod6Wp8FuX0BnAmR6fZIelqAmjz8,5712
+vision_agent-0.2.242.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.242.dist-info/RECORD,,
@@ -1,38 +0,0 @@
-import os
-from typing import List
-from uuid import UUID
-
-from requests.exceptions import HTTPError
-
-from vision_agent.clients.http import BaseHTTP
-from vision_agent.models import BboxInputBase64, JobStatus, PromptTask
-from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.utils.type_defs import LandingaiAPIKey
-
-
-class LandingPublicAPI(BaseHTTP):
-    def __init__(self) -> None:
-        landing_url = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
-        landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
-        headers = {"Content-Type": "application/json", "apikey": landing_api_key}
-        super().__init__(base_endpoint=landing_url, headers=headers)
-
-    def launch_fine_tuning_job(
-        self, model_name: str, task: PromptTask, bboxes: List[BboxInputBase64]
-    ) -> UUID:
-        url = "v1/agent/jobs/fine-tuning"
-        data = {
-            "model": {"name": model_name, "task": task.value},
-            "bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
-        }
-        response = self.post(url, payload=data)
-        return UUID(response["jobId"])
-
-    def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
-        url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
-        try:
-            get_job = self.get(url)
-        except HTTPError as err:
-            if err.response.status_code == 404:
-                raise FineTuneModelNotFound()
-        return JobStatus(get_job["status"])