vision-agent 0.2.102__py3-none-any.whl → 0.2.104__py3-none-any.whl
- vision_agent/agent/vision_agent.py +1 -1
- vision_agent/agent/vision_agent_coder.py +9 -8
- vision_agent/lmm/lmm.py +157 -51
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +47 -0
- vision_agent/utils/image_utils.py +17 -0
- {vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/METADATA +1 -1
- {vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/RECORD +10 -10
- {vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED

@@ -63,7 +63,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         dir=WORKSPACE,
         conversation=conversation,
     )
-    return extract_json(orch([{"role": "user", "content": prompt}]))
+    return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
 
 
 def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
vision_agent/agent/vision_agent_coder.py CHANGED

@@ -129,7 +129,7 @@ def write_plans(
     context = USER_REQ.format(user_request=user_request)
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
-    return extract_json(model(chat))
+    return extract_json(model(chat, stream=False))  # type: ignore
 
 
 def pick_plan(
@@ -160,7 +160,7 @@ def pick_plan(
         docstring=tool_info, plans=plan_str, previous_attempts="", media=media
     )
 
-    code = extract_code(model(prompt))
+    code = extract_code(model(prompt, stream=False))  # type: ignore
     log_progress(
         {
             "type": "log",
@@ -211,7 +211,7 @@ def pick_plan(
             "code": DefaultImports.prepend_imports(code),
         }
     )
-    code = extract_code(model(prompt))
+    code = extract_code(model(prompt, stream=False))  # type: ignore
     tool_output = code_interpreter.exec_isolation(
         DefaultImports.prepend_imports(code)
     )
@@ -251,7 +251,7 @@ def pick_plan(
        tool_output=tool_output_str[:20_000],
    )
    chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat))
+    best_plan = extract_json(model(chat, stream=False))  # type: ignore
 
    if verbosity >= 1:
        _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -286,7 +286,7 @@ def write_code(
        feedback=feedback,
    )
    chat[-1]["content"] = prompt
-    return extract_code(coder(chat))
+    return extract_code(coder(chat, stream=False))  # type: ignore
 
 
 def write_test(
@@ -310,7 +310,7 @@ def write_test(
        media=media,
    )
    chat[-1]["content"] = prompt
-    return extract_code(tester(chat))
+    return extract_code(tester(chat, stream=False))  # type: ignore
 
 
 def write_and_test_code(
@@ -439,13 +439,14 @@ def debug_code(
     while not success and count < 3:
         try:
             fixed_code_and_test = extract_json(
-                debugger(
+                debugger(  # type: ignore
                     FIX_BUG.format(
                         code=code,
                         tests=test,
                         result="\n".join(result.text().splitlines()[-50:]),
                         feedback=format_memory(working_memory + new_working_memory),
-                    )
+                    ),
+                    stream=False,
                 )
             )
             success = True
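All of the call sites above pass `stream=False` for the same reason: `__call__` on the LMM classes now returns `Union[str, Iterator[Optional[str]]]`, while `extract_json` and `extract_code` need a complete string (hence the `# type: ignore` after the narrowing). A minimal sketch of the idea, with a hypothetical `as_text` helper that is not part of the diff:

```python
from typing import Iterator, Optional, Union


def as_text(resp: Union[str, Iterator[Optional[str]]]) -> str:
    # Hypothetical helper: collapse either return shape into a full string.
    if isinstance(resp, str):
        return resp
    # Streamed responses may contain None chunks (empty deltas or the
    # end-of-stream sentinel), so drop them before joining.
    return "".join(chunk for chunk in resp if chunk is not None)
```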
vision_agent/lmm/lmm.py CHANGED

@@ -5,7 +5,7 @@ import logging
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, Iterator, List, Optional, Union, cast
 
 import anthropic
 import requests
@@ -58,22 +58,24 @@ def encode_media(media: Union[str, Path]) -> str:
 class LMM(ABC):
     @abstractmethod
     def generate(
-        self, prompt: str, media: Optional[List[Union[str, Path]]] = None
-    ) -> str:
+        self, prompt: str, media: Optional[List[Union[str, Path]]] = None, **kwargs: Any
+    ) -> Union[str, Iterator[Optional[str]]]:
         pass
 
     @abstractmethod
     def chat(
         self,
         chat: List[Message],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         pass
 
     @abstractmethod
     def __call__(
         self,
         input: Union[str, List[Message]],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         pass
 
 
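With the abstract signatures widened, every LMM backend supports both modes. A minimal usage sketch, assuming `OpenAILMM` is importable from `vision_agent.lmm` as in prior releases:

```python
from vision_agent.lmm import OpenAILMM  # assumed import path

lmm = OpenAILMM()

# Non-streaming: returns a complete str, which the agent call sites above
# rely on by passing stream=False.
text = lmm("Describe the scene in one sentence.", stream=False)
print(text)

# Streaming: returns Iterator[Optional[str]]; chunks may be None (empty
# OpenAI deltas, or the end-of-stream sentinel in the Ollama/Claude paths).
for chunk in lmm("Describe the scene in one sentence.", stream=True):
    if chunk is not None:
        print(chunk, end="", flush=True)
```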
@@ -104,15 +106,17 @@ class OpenAILMM(LMM):
     def __call__(
         self,
         input: Union[str, List[Message]],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
-            return self.generate(input)
-        return self.chat(input)
+            return self.generate(input, **kwargs)
+        return self.chat(input, **kwargs)
 
     def chat(
         self,
         chat: List[Message],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.
 
         Parameters:
@@ -141,17 +145,28 @@ class OpenAILMM(LMM):
                 )
             fixed_chat.append(fixed_c)
 
+        # prefers kwargs from second dictionary over first
+        tmp_kwargs = self.kwargs | kwargs
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=fixed_chat, **self.kwargs  # type: ignore
+            model=self.model_name, messages=fixed_chat, **tmp_kwargs  # type: ignore
         )
+        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
+
+            def f() -> Iterator[Optional[str]]:
+                for chunk in response:
+                    chunk_message = chunk.choices[0].delta.content  # type: ignore
+                    yield chunk_message
 
-        return cast(str, response.choices[0].message.content)
+            return f()
+        else:
+            return cast(str, response.choices[0].message.content)
 
     def generate(
         self,
         prompt: str,
         media: Optional[List[Union[str, Path]]] = None,
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         message: List[Dict[str, Any]] = [
             {
                 "role": "user",
@@ -173,10 +188,21 @@ class OpenAILMM(LMM):
                 },
             )
 
+        # prefers kwargs from second dictionary over first
+        tmp_kwargs = self.kwargs | kwargs
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=message, **self.kwargs  # type: ignore
+            model=self.model_name, messages=message, **tmp_kwargs  # type: ignore
         )
-        return cast(str, response.choices[0].message.content)
+        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
+
+            def f() -> Iterator[Optional[str]]:
+                for chunk in response:
+                    chunk_message = chunk.choices[0].delta.content  # type: ignore
+                    yield chunk_message
+
+            return f()
+        else:
+            return cast(str, response.choices[0].message.content)
 
     def generate_classifier(self, question: str) -> Callable:
         api_doc = T.get_tool_documentation([T.clip])
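The `self.kwargs | kwargs` merge uses the Python 3.9+ dict union, where keys from the right-hand operand win, so per-call kwargs override constructor defaults. A small illustration with assumed values:

```python
defaults = {"temperature": 0.2, "stream": False}  # e.g. self.kwargs
per_call = {"stream": True}                       # kwargs passed to chat()

merged = defaults | per_call  # right-hand side wins on key collisions
assert merged == {"temperature": 0.2, "stream": True}
```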
@@ -309,20 +335,22 @@ class OllamaLMM(LMM):
         self.url = base_url
         self.model_name = model_name
         self.json_mode = json_mode
-        self.stream = False
+        self.kwargs = kwargs
 
     def __call__(
         self,
         input: Union[str, List[Message]],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
-            return self.generate(input)
-        return self.chat(input)
+            return self.generate(input, **kwargs)
+        return self.chat(input, **kwargs)
 
     def chat(
         self,
         chat: List[Message],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         """Chat with the LMM model.
 
         Parameters:
@@ -341,40 +369,85 @@ class OllamaLMM(LMM):
         url = f"{self.url}/chat"
         model = self.model_name
         messages = fixed_chat
-        data = {"model": model, "messages": messages, "stream": self.stream}
+        data = {"model": model, "messages": messages}
+
+        tmp_kwargs = self.kwargs | kwargs
+        data.update(tmp_kwargs)
         json_data = json.dumps(data)
-        response = requests.post(url, data=json_data)
-        if response.status_code != 200:
-            raise ValueError(f"Request failed with status code {response.status_code}")
-        response = response.json()
-        return response["message"]["content"]  # type: ignore
+        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
+
+            def f() -> Iterator[Optional[str]]:
+                with requests.post(url, data=json_data, stream=True) as stream:
+                    if stream.status_code != 200:
+                        raise ValueError(
+                            f"Request failed with status code {stream.status_code}"
+                        )
+
+                    for chunk in stream.iter_content(chunk_size=None):
+                        chunk_data = json.loads(chunk)
+                        if chunk_data["done"]:
+                            yield None
+                        else:
+                            yield chunk_data["message"]["content"]
+
+            return f()
+        else:
+            stream = requests.post(url, data=json_data)
+            if stream.status_code != 200:
+                raise ValueError(
+                    f"Request failed with status code {stream.status_code}"
+                )
+            stream = stream.json()
+            return stream["message"]["content"]  # type: ignore
 
     def generate(
         self,
         prompt: str,
         media: Optional[List[Union[str, Path]]] = None,
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
 
         url = f"{self.url}/generate"
         data = {
             "model": self.model_name,
             "prompt": prompt,
             "images": [],
-            "stream": self.stream,
         }
 
-        json_data = json.dumps(data)
         if media and len(media) > 0:
             for m in media:
                 data["images"].append(encode_media(m))  # type: ignore
 
-        response = requests.post(url, data=json_data)
+        tmp_kwargs = self.kwargs | kwargs
+        data.update(tmp_kwargs)
+        json_data = json.dumps(data)
+        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
+
+            def f() -> Iterator[Optional[str]]:
+                with requests.post(url, data=json_data, stream=True) as stream:
+                    if stream.status_code != 200:
+                        raise ValueError(
+                            f"Request failed with status code {stream.status_code}"
+                        )
+
+                    for chunk in stream.iter_content(chunk_size=None):
+                        chunk_data = json.loads(chunk)
+                        if chunk_data["done"]:
+                            yield None
+                        else:
+                            yield chunk_data["response"]
 
-        if response.status_code != 200:
-            raise ValueError(f"Request failed with status code {response.status_code}")
+            return f()
+        else:
+            stream = requests.post(url, data=json_data)
+
+            if stream.status_code != 200:
+                raise ValueError(
+                    f"Request failed with status code {stream.status_code}"
+                )
 
-        response = response.json()
-        return response["response"]  # type: ignore
+            stream = stream.json()
+            return stream["response"]  # type: ignore
 
 
 class ClaudeSonnetLMM(LMM):
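The Ollama branch streams newline-delimited JSON: each chunk is a standalone JSON object, and a `"done": true` object terminates the stream, which `f()` maps to a trailing `None`. A sketch with hand-written frames (field layout follows the diff; real Ollama responses carry additional fields):

```python
import json

frames = [
    b'{"message": {"content": "Hel"}, "done": false}',
    b'{"message": {"content": "lo"}, "done": false}',
    b'{"done": true}',
]
for frame in frames:
    chunk_data = json.loads(frame)
    if chunk_data["done"]:
        print()  # stream finished; chat() would yield None here
    else:
        print(chunk_data["message"]["content"], end="")
```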
@@ -385,27 +458,28 @@ class ClaudeSonnetLMM(LMM):
         api_key: Optional[str] = None,
         model_name: str = "claude-3-sonnet-20240229",
         max_tokens: int = 4096,
-        temperature: float = 0.7,
         **kwargs: Any,
     ):
         self.client = anthropic.Anthropic(api_key=api_key)
         self.model_name = model_name
-        self.max_tokens = max_tokens
-        self.temperature = temperature
+        if "max_tokens" not in kwargs:
+            kwargs["max_tokens"] = max_tokens
         self.kwargs = kwargs
 
     def __call__(
         self,
         input: Union[str, List[Dict[str, Any]]],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
-            return self.generate(input)
-        return self.chat(input)
+            return self.generate(input, **kwargs)
+        return self.chat(input, **kwargs)
 
     def chat(
         self,
         chat: List[Dict[str, Any]],
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         messages: List[MessageParam] = []
         for msg in chat:
             content: List[Union[TextBlockParam, ImageBlockParam]] = [
@@ -426,20 +500,35 @@ class ClaudeSonnetLMM(LMM):
                     )
             messages.append({"role": msg["role"], "content": content})
 
+        # prefers kwargs from second dictionary over first
+        tmp_kwargs = self.kwargs | kwargs
         response = self.client.messages.create(
-            model=self.model_name,
-            max_tokens=self.max_tokens,
-            temperature=self.temperature,
-            messages=messages,
-            **self.kwargs,
+            model=self.model_name, messages=messages, **tmp_kwargs
         )
-        return cast(str, response.content[0].text)
+        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
+
+            def f() -> Iterator[Optional[str]]:
+                for chunk in response:
+                    if (
+                        chunk.type == "message_start"
+                        or chunk.type == "content_block_start"
+                    ):
+                        continue
+                    elif chunk.type == "content_block_delta":
+                        yield chunk.delta.text
+                    elif chunk.type == "message_stop":
+                        yield None
+
+            return f()
+        else:
+            return cast(str, response.content[0].text)
 
     def generate(
         self,
         prompt: str,
         media: Optional[List[Union[str, Path]]] = None,
-    ) -> str:
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
         content: List[Union[TextBlockParam, ImageBlockParam]] = [
             TextBlockParam(type="text", text=prompt)
         ]
@@ -456,11 +545,28 @@ class ClaudeSonnetLMM(LMM):
                         },
                     )
                 )
+
+        # prefers kwargs from second dictionary over first
+        tmp_kwargs = self.kwargs | kwargs
         response = self.client.messages.create(
             model=self.model_name,
-            max_tokens=self.max_tokens,
-            temperature=self.temperature,
             messages=[{"role": "user", "content": content}],
-            **self.kwargs,
+            **tmp_kwargs,
         )
-        return cast(str, response.content[0].text)
+        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
+
+            def f() -> Iterator[Optional[str]]:
+                for chunk in response:
+                    if (
+                        chunk.type == "message_start"
+                        or chunk.type == "content_block_start"
+                    ):
+                        continue
+                    elif chunk.type == "content_block_delta":
+                        yield chunk.delta.text
+                    elif chunk.type == "message_stop":
+                        yield None
+
+            return f()
+        else:
+            return cast(str, response.content[0].text)
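Note the constructor change in `ClaudeSonnetLMM`: `temperature` is no longer a dedicated parameter; it now travels through `**kwargs` like any other sampling option, and `max_tokens` is only injected when the caller has not already set it. A small illustration of that default handling:

```python
kwargs = {"temperature": 0.7}  # sampling options now arrive via **kwargs
max_tokens = 4096              # the constructor default

if "max_tokens" not in kwargs:
    kwargs["max_tokens"] = max_tokens

assert kwargs == {"temperature": 0.7, "max_tokens": 4096}
```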
vision_agent/tools/__init__.py CHANGED

vision_agent/tools/tools.py CHANGED
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
     denormalize_bbox,
     get_image_size,
     normalize_bbox,
+    convert_quad_box_to_bbox,
     rle_decode,
 )
 
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
     return return_data
 
 
+def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+    Each text region contains one line of text. It returns a list of detected text,
+    the text region as a bounding box with normalized coordinates, and confidence
+    scores. The results are sorted from top-left to bottom right.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with normalized coordinates, and confidence score.
+
+    Example
+    -------
+    >>> florencev2_ocr(image)
+    [
+        {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+    """
+
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "task": "<OCR_WITH_REGION>",
+        "function_name": "florencev2_ocr",
+    }
+
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<OCR_WITH_REGION>"]
+    return_data = []
+    for i in range(len(detections["quad_boxes"])):
+        return_data.append(
+            {
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(
+                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
+                ),
+                "score": 1.0,
+            }
+        )
+    return return_data
+
+
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     """'detr_segmentation' is a tool that can segment common objects in an
     image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1294,7 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
+    florencev2_ocr,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
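A hypothetical call of the new tool (the blank image and the `T` alias are illustrative; `florencev2_ocr` is also added to the `TOOLS` registry above and exported from `vision_agent/tools/__init__.py`):

```python
import numpy as np

import vision_agent.tools as T  # assumed alias for the tools package

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder frame

# Each detection carries a text label, a normalized [x_min, y_min, x_max,
# y_max] bbox derived from the quad box, and a fixed score of 1.0.
for det in T.florencev2_ocr(image):
    print(det["label"], det["bbox"], det["score"])
```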
vision_agent/utils/image_utils.py CHANGED

@@ -140,6 +140,23 @@ def denormalize_bbox(
     return bbox
 
 
+def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
+    r"""Convert a quadrilateral bounding box to a rectangular bounding box.
+
+    Parameters:
+        quad_box: the quadrilateral bounding box
+
+    Returns:
+        The rectangular bounding box
+    """
+    x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
+    x_min = min(x1, x2, x3, x4)
+    x_max = max(x1, x2, x3, x4)
+    y_min = min(y1, y2, y3, y4)
+    y_max = max(y1, y2, y3, y4)
+    return [x_min, y_min, x_max, y_max]
+
+
 def overlay_bboxes(
     image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
 ) -> ImageType:
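A worked example of the conversion: the axis-aligned box is just the min/max of the four corner coordinates (values below are made up):

```python
# x1, y1, x2, y2, x3, y3, x4, y4 of a slightly tilted text region
quad = [10, 20, 50, 22, 48, 60, 9, 58]

x1, y1, x2, y2, x3, y3, x4, y4 = quad
bbox = [
    min(x1, x2, x3, x4),  # x_min = 9
    min(y1, y2, y3, y4),  # y_min = 20
    max(x1, x2, x3, x4),  # x_max = 50
    max(y1, y2, y3, y4),  # y_max = 60
]
assert bbox == [9, 20, 50, 60]
```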
{vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/RECORD CHANGED

@@ -2,8 +2,8 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9Ks,135
 vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
 vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
-vision_agent/agent/vision_agent.py,sha256=…
-vision_agent/agent/vision_agent_coder.py,sha256=…
+vision_agent/agent/vision_agent.py,sha256=4vzKYNoScv_sOZiqefo46iKJNZOtqSFvSJif0zZIdLI,8471
+vision_agent/agent/vision_agent_coder.py,sha256=oo3IoRrc-cVdjKq_YsjzkBZNTBtiCTIctGfeC5C7MXM,30926
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
 vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,22 +12,22 @@ vision_agent/clients/landing_public_api.py,sha256=Tjl8uBZWc3dvrCOKg-PCYjw3RC3X5Y
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=…
+vision_agent/lmm/lmm.py,sha256=TgEwrtQqpnWlBYEvsSU6DbkY3Y7MM8wRb4lMQgSiM0k,19435
 vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
-vision_agent/tools/__init__.py,sha256=…
+vision_agent/tools/__init__.py,sha256=MK0D8NtIChwGHwqsTz3LeV5BGuQecNVrNzUsyaEwuGA,1926
 vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
 vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=fgPE0VHfBiQPJKkslBm_hugTOyRT-Hnw7eztvC-l4_o,44661
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
 vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
-vision_agent/utils/image_utils.py,sha256=…
+vision_agent/utils/image_utils.py,sha256=c_g5i_cFC0C-Yw9gU_NaVgQdmBlyumw3bLIDtCU42xo,8200
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.102.dist-info/LICENSE,sha256=…
-vision_agent-0.2.102.dist-info/METADATA,sha256=…
-vision_agent-0.2.102.dist-info/WHEEL,sha256=…
-vision_agent-0.2.102.dist-info/RECORD,,
+vision_agent-0.2.104.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.104.dist-info/METADATA,sha256=aSP8goyL8RZS_6SZSzrJZCsIzySrN_domJ2vvvbedQg,10729
+vision_agent-0.2.104.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.104.dist-info/RECORD,,
{vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/LICENSE: file without changes

{vision_agent-0.2.102.dist-info → vision_agent-0.2.104.dist-info}/WHEEL: file without changes