vision-agent 0.2.56__py3-none-any.whl → 0.2.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,346 +0,0 @@
1
- import json
2
- import logging
3
- import sys
4
- from pathlib import Path
5
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
-
7
- from vision_agent.llm import LLM, OpenAILLM
8
- from vision_agent.lmm import LMM
9
- from vision_agent.tools.easytool_tools import TOOLS
10
-
11
- from .agent import Agent
12
- from .easytool_prompts import (
13
- ANSWER_GENERATE,
14
- ANSWER_SUMMARIZE,
15
- CHOOSE_PARAMETER,
16
- CHOOSE_TOOL,
17
- TASK_DECOMPOSE,
18
- TASK_TOPOLOGY,
19
- )
20
-
21
# Route log records to stdout so agent progress is visible in notebooks/CLIs.
logging.basicConfig(stream=sys.stdout)
# Module-level logger; the EasyTool constructor raises its level to INFO
# when constructed with verbose=True.
_LOGGER = logging.getLogger(__name__)
23
-
24
-
25
def parse_json(s: str) -> Any:
    """Parse JSON emitted by an LLM, tolerating common deviations.

    Models sometimes wrap output in markdown code fences and/or use
    Python-style booleans ("True"/"False"). Normalize both to strict JSON
    before parsing. (The previous version did the opposite — it rewrote
    valid JSON ``true``/``false`` into Python ``True``/``False``, which
    ``json.loads`` rejects.)

    Parameters:
        s: raw model output expected to contain a JSON document.

    Returns:
        The parsed Python object.

    Raises:
        json.JSONDecodeError: if the cleaned string is still not valid JSON.
    """
    s = (
        s.replace(": True", ": true")
        .replace(": False", ": false")
        .replace(":True", ":true")
        .replace(":False", ":false")
        .replace("```json", "")
        .replace("```", "")
        .strip()
    )
    return json.loads(s)
35
-
36
-
37
def change_name(name: str) -> str:
    """Prefix reserved or unsafe parameter names with "is_" (lower-cased).

    Keeps generated keyword names from colliding with Python keywords or
    ambiguous identifiers; all other names pass through unchanged.
    """
    reserved = ("from", "class", "return", "false", "true", "id", "and", "", "ID")
    return "is_" + name.lower() if name in reserved else name
42
-
43
-
44
def format_tools(tools: Dict[int, Any]) -> str:
    """Render the tool registry as "ID: <id> - <tool>" lines.

    Laid out this way so the model can see unambiguously which ID maps to
    which tool description.
    """
    return "".join(f"ID: {tool_id} - {tool}\n" for tool_id, tool in tools.items())
50
-
51
-
52
def topological_sort(tasks: List[Dict]) -> List[Dict]:
    """Order tasks so each task's dependencies come before it (Kahn's algorithm).

    Each task is a dict with an "id" and a "dep" list of prerequisite ids;
    dep entries that are not valid task ids (e.g. -1) are ignored. Tasks
    trapped in a dependency cycle are appended at the end in their original
    order rather than dropped.
    """
    valid_ids = {t["id"] for t in tasks}
    # Count, per task, how many of its deps refer to actual tasks.
    remaining = {t["id"]: sum(1 for d in t["dep"] if d in valid_ids) for t in tasks}

    ready = [t for t in tasks if remaining[t["id"]] == 0]
    ordered: List[Dict] = []

    while ready:
        node = ready.pop(0)
        ordered.append(node)
        # Releasing `node` may unblock any task that depends on it.
        for t in tasks:
            if node["id"] in t["dep"]:
                remaining[t["id"]] -= 1
                if remaining[t["id"]] == 0:
                    ready.append(t)

    if len(ordered) != len(tasks):
        done_ids = {t["id"] for t in ordered}
        ordered.extend(t for t in tasks if t["id"] not in done_ids)
    return ordered
77
-
78
-
79
def task_decompose(
    model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any]
) -> Optional[Dict]:
    """Ask *model* to split *question* into subtasks given the tool list.

    Returns the parsed "Tasks" payload, or None if the model never produces
    parsable output within the retry budget.
    """
    prompt = TASK_DECOMPOSE.format(question=question, tools=format_tools(tools))
    raw = ""
    # LLM output is not guaranteed to be valid JSON; retry up to 12 times.
    for _ in range(12):
        try:
            raw = model(prompt)
            return parse_json(raw)["Tasks"]  # type: ignore
        except Exception:
            continue
    _LOGGER.error(f"Failed task_decompose on: {raw}")
    return None
96
-
97
-
98
def task_topology(
    model: Union[LLM, LMM, Agent], question: str, task_list: List[Dict]
) -> List[Dict[str, Any]]:
    """Ask *model* for dependency links between the subtasks in *task_list*.

    Each returned task's "dep" entry is normalized to a list of ints. Falls
    back to the unmodified *task_list* when the model never produces
    parsable output within the retry budget.
    """
    prompt = TASK_TOPOLOGY.format(question=question, task_list=task_list)
    raw = ""
    for _ in range(12):
        try:
            raw = model(prompt)
            tasks = parse_json(raw)["Tasks"]
            for task in tasks:
                dep = task["dep"]
                # The model may emit deps as "1,2", as a bare int, or as a
                # list; coerce every shape to a list of ints.
                if isinstance(dep, str):
                    task["dep"] = [int(d) for d in dep.split(",")]
                elif isinstance(dep, int):
                    task["dep"] = [dep]
                elif isinstance(dep, list):
                    task["dep"] = [int(d) for d in dep]
            return tasks  # type: ignore
        except Exception:
            continue
    _LOGGER.error(f"Failed task_topology on: {raw}")
    return task_list
122
-
123
-
124
def choose_tool(
    model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any]
) -> Optional[int]:
    """Ask *model* to pick one tool ID from *tools* for *question*.

    Returns the chosen ID, or None if the model never produces parsable
    output within the retry budget.
    """
    prompt = CHOOSE_TOOL.format(question=question, tools=format_tools(tools))
    raw = ""
    for _ in range(12):
        try:
            raw = model(prompt)
            return parse_json(raw)["ID"]  # type: ignore
        except Exception:
            continue
    _LOGGER.error(f"Failed choose_tool on: {raw}")
    return None
141
-
142
-
143
def choose_parameter(
    model: Union[LLM, LMM, Agent], question: str, tool_usage: Dict, previous_log: str
) -> Optional[Any]:
    """Ask *model* for the parameters to call the chosen tool with.

    Returns the parsed "Parameters" payload (a kwargs dict, or a list of
    them for multiple calls), or None after the retry budget is exhausted.
    """
    # TODO: should format tool_usage
    prompt = CHOOSE_PARAMETER.format(
        question=question, tool_usage=tool_usage, previous_log=previous_log
    )
    raw = ""
    for _ in range(12):
        try:
            raw = model(prompt)
            return parse_json(raw)["Parameters"]
        except Exception:
            continue
    _LOGGER.error(f"Failed choose_parameter on: {raw}")
    return None
163
-
164
-
165
def answer_generate(
    model: Union[LLM, LMM, Agent], question: str, call_results: str, previous_log: str
) -> str:
    """Phrase one subtask's raw tool output as a natural-language answer."""
    return model(
        ANSWER_GENERATE.format(
            question=question, call_results=call_results, previous_log=previous_log
        )
    )
172
-
173
-
174
def answer_summarize(
    model: Union[LLM, LMM, Agent], question: str, answers: List[Dict]
) -> str:
    """Merge the per-subtask answers into one final answer for *question*."""
    return model(ANSWER_SUMMARIZE.format(question=question, answers=answers))
179
-
180
-
181
def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any:
    """Instantiate *tool* and invoke the instance with *parameters*.

    Any failure (construction, bad kwargs, runtime error) is logged and
    reported as None rather than raised, so one bad call does not abort the
    whole workflow.
    """
    try:
        instance = tool()
        return instance(**parameters)
    except Exception as e:
        _LOGGER.error(f"Failed function_call on: {e}")
        return None
187
-
188
-
189
def retrieval(
    model: Union[LLM, LMM, Agent],
    question: str,
    tools: Dict[int, Any],
    previous_log: str,
) -> Tuple[List[Dict], str]:
    """Pick a tool for *question*, choose its parameters, and execute it.

    Parameters:
        model: model used for tool and parameter selection.
        question: the subtask to solve.
        tools: tool registry; each entry must provide "description", "usage",
            "name" and "class" keys (as used below).
        previous_log: stringified log of earlier subtasks/answers.

    Returns:
        (tool_results, call_results_str); returns ([{}], "") as an empty
        sentinel when no tool or no parameters could be chosen.
    """
    # Only the descriptions are offered when asking the model for a tool ID.
    tool_id = choose_tool(
        model, question, {k: v["description"] for k, v in tools.items()}
    )
    if tool_id is None:
        return [{}], ""
    _LOGGER.info(f"\t(Tool ID, name): ({tool_id}, {tools[tool_id]['name']})")

    tool_instructions = tools[tool_id]
    tool_usage = tool_instructions["usage"]
    tool_name = tool_instructions["name"]

    parameters = choose_parameter(model, question, tool_usage, previous_log)
    _LOGGER.info(f"\tParameters: {parameters} for {tool_name}")
    if parameters is None:
        return [{}], ""
    tool_results = [
        {"task": question, "tool_name": tool_name, "parameters": parameters}
    ]

    def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any:
        # "parameters" may be one kwargs dict or a list of them (the model
        # can request multiple calls); failed calls are silently skipped.
        call_results: List[Any] = []
        if isinstance(result["parameters"], Dict):
            call_result = function_call(tools[tool_id]["class"], result["parameters"])
            if call_result is None:
                return call_results
            call_results.append(call_result)
        elif isinstance(result["parameters"], List):
            for parameters in result["parameters"]:
                call_result = function_call(tools[tool_id]["class"], parameters)
                if call_result is None:
                    continue
                call_results.append(call_result)
        return call_results

    call_results = []
    for i, result in enumerate(tool_results):
        call_results.extend(parse_tool_results(result))
        # NOTE(review): every entry is assigned the same accumulating list
        # object; with the single-entry tool_results built above this is
        # equivalent to per-entry results — confirm if more entries are
        # ever added.
        tool_results[i]["call_results"] = call_results

    call_results_str = "\n\n".join([str(e) for e in call_results if e is not None])
    _LOGGER.info(f"\tCall Results: {call_results_str}")
    return tool_results, call_results_str
237
-
238
-
239
class EasyTool(Agent):
    r"""This is an implementation of the EasyTool paper https://arxiv.org/abs/2401.06201
    based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool
    from the funcQA code.

    Example
    -------
    >>> from vision_agent.agent import EasyTool
    >>> agent = EasyTool()
    >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")
    >>> print(resp)
    "It will travel approximately 31.03 kilometers in 29 minutes."
    >>> resp = agent("How many cards are in this image?", media="cards.jpg")
    >>> print(resp)
    "There are 2 cards in this image."
    """

    def __init__(
        self,
        task_model: Optional[Union[LLM, LMM]] = None,
        answer_model: Optional[Union[LLM, LMM]] = None,
        verbose: bool = False,
    ):
        """Initialize the EasyTool agent.

        Parameters:
            task_model: model used for task decomposition, tool choice and
                parameter choice; defaults to an OpenAI LLM in JSON mode.
            answer_model: model used to phrase per-subtask answers and the
                final summary; defaults to a plain OpenAI LLM.
            verbose: if True, raise the module logger to INFO so per-subtask
                progress is printed.
        """
        self.task_model = (
            OpenAILLM(json_mode=True) if task_model is None else task_model
        )
        self.answer_model = OpenAILLM() if answer_model is None else answer_model

        self.retrieval_num = 3
        self.tools = TOOLS
        if verbose:
            _LOGGER.setLevel(logging.INFO)

    def __call__(
        self,
        input: Union[List[Dict[str, str]], str],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Invoke the vision agent.

        Parameters:
            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
            media: the input image file referenced in the prompt parameter.

        Returns:
            A text response.
        """
        if isinstance(input, str):
            input = [{"role": "user", "content": input}]
        return self.chat(input, media=media)

    def chat_with_workflow(
        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
    ) -> Tuple[str, List[Dict]]:
        """Answer the conversation and also return per-subtask tool results.

        Parameters:
            chat: conversation; only the first message's content is used as
                the question.
            media: optional image path appended to the question.

        Returns:
            A tuple of (final summarized answer, list of tool-result dicts).
        """
        question = chat[0]["content"]
        if media:
            question += f" Image name: {media}"
        tasks = task_decompose(
            self.task_model,
            question,
            {k: v["description"] for k, v in self.tools.items()},
        )
        _LOGGER.info(f"Tasks: {tasks}")
        if tasks is not None:
            task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
            task_list = task_topology(self.task_model, question, task_list)
            try:
                task_list = topological_sort(task_list)
            except Exception:
                # Fall back to the unsorted list rather than aborting.
                _LOGGER.error(f"Failed topological_sort on: {task_list}")
        else:
            task_list = []

        _LOGGER.info(f"Task Dependency: {task_list}")
        # task_depend is stringified into previous_log and shown to the
        # model, so its text (including this key) must read correctly.
        task_depend = {"Original Question": question}
        previous_log = ""
        answers = []
        for task in task_list:
            task_depend[task["id"]] = {"task": task["task"], "answer": ""}  # type: ignore
        all_tool_results = []
        for task in task_list:
            task_str = task["task"]
            previous_log = str(task_depend)
            _LOGGER.info(f"\tSubtask: {task_str}")
            tool_results, call_results = retrieval(
                self.task_model,
                task_str,
                self.tools,
                previous_log,
            )
            answer = answer_generate(
                self.answer_model, task_str, call_results, previous_log
            )

            for tool_result in tool_results:
                tool_result["answer"] = answer
            all_tool_results.extend(tool_results)

            _LOGGER.info(f"\tAnswer: {answer}")
            answers.append({"task": task_str, "answer": answer})
            task_depend[task["id"]]["answer"] = answer  # type: ignore
        return answer_summarize(self.answer_model, question, answers), all_tool_results

    def chat(
        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
    ) -> str:
        """Like chat_with_workflow, but return only the final answer."""
        answer, _ = self.chat_with_workflow(chat, media=media)
        return answer
@@ -1,89 +0,0 @@
1
# Prompt for splitting a complex question into tool-sized subtasks.
# Placeholders: {question}, {tools}. Double braces render literal JSON braces.
TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
This is the user's question: {question}
This is the tool list:
{tools}

Please note that:
1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list.
2. If one subtask needs the results from another subtask, you should write clearly. For example:
{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
3. You must ONLY output in a parsable JSON format. An example output looks like:

{{"Tasks": ["Task 1", "Task 2", ...]}}

Output: """
15
-
16
# Prompt asking the model to add dependency ("dep") links between subtasks.
# Placeholders: {question}, {task_list}. The example JSON previously showed
# '"id", task_id' (comma instead of colon) — corrected to valid JSON.
TASK_TOPOLOGY = """Given a user's complex question, I have decomposed this question into some simple subtasks. I think there exist logical connections and order among the tasks. Thus, you need to help me output these logical connections and order.
You must ONLY output in a parsable JSON format with the following format:

{{"Tasks": [{{"task": task, "id": task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}}

The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1.


This is the user's question: {question}

These are subtasks of this question:

{task_list}

Output: """
31
-
32
# Prompt for selecting exactly one tool ID for a question.
# Placeholders: {question}, {tools}.
CHOOSE_TOOL = """This is the user's question: {question}
These are the tools you can select to solve the question:
{tools}

Please note that:
1. You should only choose one tool from the Tool List to solve this question.
2. You must ONLY output the ID of the tool you chose in a parsable JSON format. Two example outputs look like:

Example 1: {{"ID": 1}}
Example 2: {{"ID": 2}}

Output: """
44
-
45
# Prompt for choosing the parameters to call the selected tool with.
# Placeholders: {previous_log}, {question}, {tool_usage}.
CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
Please note that:
1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs.
2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
5. If you need to use this API multiple times, please set "Parameters" to a list.
6. You must ONLY output in a parsable JSON format. Two example outputs look like:

Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}

These are logs of previous questions and answers:
{previous_log}

This is the current user's question: {question}
This is the API tool documentation: {tool_usage}
Output: """
63
-
64
-
65
# Prompt for phrasing one subtask's raw tool output as a natural-language
# answer. Placeholders: {previous_log}, {question}, {call_results}.
ANSWER_GENERATE = """You should answer the question based on the response output by the API tool.
Please note that:
1. Try to organize the response into a natural language answer.
2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
3. If the API tool does not provide useful information in the response, please answer with your knowledge.
4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.

These are logs of previous questions and answers:
{previous_log}

This is the user's question: {question}

This is the response output by the API tool:
{call_results}

We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
Output: """
82
-
83
# Prompt for merging all per-subtask answers into one final answer.
# Placeholders: {question}, {answers}.
ANSWER_SUMMARIZE = """We break down a complex user's problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question.
This is the user's question: {question}

These are subtasks and their answers:
{answers}

Final answer: """