PyPI - user-simulator - Versions diffs - 0.1.0__py3-none-any.whl - Mend

user-simulator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

user_sim/__init__.py +0 -0
user_sim/cli/__init__.py +0 -0
user_sim/cli/gen_user_profile.py +34 -0
user_sim/cli/init_project.py +65 -0
user_sim/cli/sensei_chat.py +481 -0
user_sim/cli/sensei_check.py +103 -0
user_sim/cli/validation_check.py +143 -0
user_sim/core/__init__.py +0 -0
user_sim/core/ask_about.py +665 -0
user_sim/core/data_extraction.py +260 -0
user_sim/core/data_gathering.py +134 -0
user_sim/core/interaction_styles.py +147 -0
user_sim/core/role_structure.py +608 -0
user_sim/core/user_simulator.py +302 -0
user_sim/handlers/__init__.py +0 -0
user_sim/handlers/asr_module.py +128 -0
user_sim/handlers/html_parser_module.py +202 -0
user_sim/handlers/image_recognition_module.py +139 -0
user_sim/handlers/pdf_parser_module.py +123 -0
user_sim/utils/__init__.py +0 -0
user_sim/utils/config.py +47 -0
user_sim/utils/cost_tracker.py +153 -0
user_sim/utils/cost_tracker_v2.py +193 -0
user_sim/utils/errors.py +15 -0
user_sim/utils/exceptions.py +47 -0
user_sim/utils/languages.py +78 -0
user_sim/utils/register_management.py +62 -0
user_sim/utils/show_logs.py +63 -0
user_sim/utils/token_cost_calculator.py +338 -0
user_sim/utils/url_management.py +60 -0
user_sim/utils/utilities.py +568 -0
user_simulator-0.1.0.dist-info/METADATA +733 -0
user_simulator-0.1.0.dist-info/RECORD +37 -0
user_simulator-0.1.0.dist-info/WHEEL +5 -0
user_simulator-0.1.0.dist-info/entry_points.txt +6 -0
user_simulator-0.1.0.dist-info/licenses/LICENSE.txt +21 -0
user_simulator-0.1.0.dist-info/top_level.txt +1 -0

user_sim/core/data_extraction.py ADDED Viewed

@@ -0,0 +1,260 @@
+import re
+import logging
+from dateutil import parser
+from langchain_core.prompts import ChatPromptTemplate
+from user_sim.utils.token_cost_calculator import calculate_cost
+from user_sim.utils import config
+from user_sim.utils.utilities import init_model
+from datetime import date
+# First value returned by init_model(); empty until init_data_extraction_module() runs.
+model = ""
+# Second value returned by init_model(); None means the module has not been initialised yet.
+llm = None
+logger = logging.getLogger('Info Logger')
+def init_data_extraction_module():
+    global model
+    global llm
+    model, llm = init_model()
+class DataExtraction:
+    def __init__(self, conversation, variable_name, dtype, description):
+        self.model = "gpt-4o-mini"
+        self.message = f"{conversation['interaction']}"
+        self.dtype = dtype
+        self.variable = variable_name
+        self.description = description
+        self.system = f"""
+        You're an assistant that analyzes a conversation between a user and a chatbot.
+        Your objective is to test the chatbot's capabilities by extracting the information only if the chatbot provides it
+        or verifies it. Output only the requested data, If you couldn't find it, output None.
+        """
+    @staticmethod
+    def data_process(text, dtype):
+        logger.info(f'input text on data process for casting: {text}')
+        if text is None or text == 'null':
+            return text
+        try:
+            if dtype == 'int':
+                return int(text)
+            elif dtype == 'float':
+                return float(text)
+            elif dtype == 'money':
+                return text
+            elif dtype == 'str':
+                return str(text)
+            elif dtype == 'bool':
+                return bool(text)
+            elif dtype == 'time':
+                # time = parser.parse(text).time().strftime("%H:%M:%S")
+                time = str(text)
+                return time
+            elif dtype == 'date':
+                # date = parser.parse(text).date()
+                date = str(text)
+                return date
+            else:
+                return text
+        except ValueError as e:
+            logger.warning(f"Error in casting: {e}. Returning 'str({str(text)})'.")
+            return str(text)
+    def get_data_prompt(self, dtype):
+        time_format = "hh:mm:ss"
+        date_format = "month/day/year"
+        todays_date = date.today()
+        if "time(" in dtype:
+            match = re.findall(r'\((.*?)\)', dtype)
+            if match:
+                time_format = match
+            dtype = "time"
+        if "date(" in dtype:
+            match = re.findall(r'\((.*?)\)', dtype)
+            if match:
+                date_format = match
+            dtype = "date"
+        data_type = {'int': 'integer',
+                     'float': 'number',
+                     'string': 'string',
+                     'time': 'string',
+                     'bool': 'boolean',
+                     'date': 'string',
+                     'list': 'array'}
+        data_format = {'int': '',
+                       'float': '',
+                       'string': "Extract and  display concisely only the requested information "
+                              "without including additional context",
+                       'time': f'Output just the time data (not date) following strictly this format: {time_format}',
+                       'bool': '',
+                       'list': 'Output only the content to list.',
+                       'date': f'''
+                       Output just the date data (not time) following strictly this format: {date_format}.
+                       If you're getting a relative date, for example, "tomorrow", "yesterday", "in two days",
+                       keep in mind that today is {todays_date}.
+'''}
+        prompt_type = data_type.get(dtype)
+        d_format = data_format.get(dtype)
+        return prompt_type, d_format
+    def static_extraction(self, dtype, dformat, list_dtype):
+        parsed_input_message = self.system + self.message
+        description = f"{self.description}. {dformat}"
+        prompt = ChatPromptTemplate.from_messages([("system", self.system + description), ("human", "{input}")])
+        if dtype == "array":
+            answer = {
+                "type": [dtype, 'null'],
+                "items": {
+                    "type": list_dtype
+                }
+            }
+        else:
+            answer = {
+                "type": [dtype, 'null'],
+            }
+        response_format =  {
+                    "title": "Data_extraction",
+                    "description": description,
+                    "type": "object",
+                    "properties": {
+                        "answer": answer
+                    },
+                    "required": ['answer'],
+                    "additionalProperties": False,
+                }
+        structured_llm = llm.with_structured_output(response_format)
+        prompted_structured_llm = prompt | structured_llm
+        response = prompted_structured_llm.invoke({"input": self.message})
+        output_message = response["answer"]
+        if config.token_count_enabled:
+            calculate_cost(parsed_input_message, output_message, model=self.model, module="data_extraction")
+        return output_message
+    def dynamic_extraction(self, extraction, llm_output):
+        extraction_keys = list(extraction.keys())
+        field_definitions = {key: ([self.get_data_prompt(extraction[key]["type"])[0], "null"], extraction[key]["description"]) for key in extraction_keys}
+        if llm_output is None:
+            logger.warning("Couldn't get an answer from static extraction.")
+            llm_output = "none"
+        message = llm_output
+        parsed_input_message = self.system + message
+        properties = {}
+        required = []
+        for field_name, (field_type, field_description) in field_definitions.items():
+            properties[field_name] = {
+                "type": field_type,
+                "description": field_description
+            }
+            required.append(field_name)
+        response_format = {
+            "title": "data_extraction",
+            "description": "The data you want to extract",
+            "type": "object",
+            "properties": properties,
+            "required": required,
+            "additionalProperties": False,
+        }
+        prompt = ChatPromptTemplate.from_messages([("system", self.system), ("human", "{input}")])
+        structured_llm = llm.with_structured_output(response_format)
+        prompted_structured_llm = prompt | structured_llm
+        response = prompted_structured_llm.invoke({"input": message})
+        llm_output = response
+        parsed_output_message = str(response)
+        if config.token_count_enabled:
+            calculate_cost(parsed_input_message, parsed_output_message, model=self.model, module="data_extraction")
+        return llm_output
+    def get_data_extraction(self):
+        custom_types_names = list(config.types_dict.keys())
+        if llm is None:
+            logger.error("data extraction module not initialized.")
+            return {"output": None}
+        list_dtype = None
+        # If data type is custom
+        if self.dtype in custom_types_names:
+            type_yaml = config.types_dict.get(self.dtype, "string")
+            dformat = f"Data should be strictly outputted following regular expression pattern: {type_yaml['format']}"
+            if isinstance(type_yaml["extraction"], dict):
+                dtype = self.get_data_prompt("string")
+                static_output = self.static_extraction(dtype[0], dformat, list_dtype)
+                llm_output = self.dynamic_extraction(type_yaml["extraction"], static_output)
+                return {self.variable: llm_output}
+            else:
+                dtype = self.get_data_prompt(type_yaml["extraction"])
+                llm_output = self.static_extraction(dtype[0], dformat, list_dtype)
+                logger.info(f'LLM output for data extraction: {llm_output}')
+                return {self.variable: llm_output}
+        # If data type is predefined
+        else:
+            if "list" in self.dtype:
+                pattern = r'(\w+)\[(.*?)\]'
+                match = re.match(pattern, self.dtype)
+                if match:
+                    list_name = match.group(1)
+                    content = match.group(2)
+                    dtype = self.get_data_prompt(list_name)[0]
+                    list_dtype = self.get_data_prompt(content)[0]
+                    dformat = self.get_data_prompt(list_name)[1]
+                else:
+                    logger.error("Invalid structure on list for output data. Using 'string' by default.")
+                    dtype = self.get_data_prompt('string')[0]
+                    dformat = self.get_data_prompt('string')[1]
+            else:
+                dtype, dformat = self.get_data_prompt(self.dtype)
+            if dtype is None:
+                logger.warning(f"Data type {self.dtype} is not supported. Using 'string' by default.")
+                dtype = 'string'
+            if dformat is None:
+                logger.warning(f"Data format for {self.dtype} is not supported. Using default format.")
+                dformat = "Extract and display concisely only the requested information without including additional context"
+            llm_output = self.static_extraction(dtype, dformat, list_dtype)
+            logger.info(f'LLM output for data extraction: {llm_output}')
+            # text = llm_output['answer']
+            data = self.data_process(llm_output, self.dtype)
+            return {self.variable: data}

user_sim/core/data_gathering.py ADDED Viewed

@@ -0,0 +1,134 @@
+import ast
+import pandas as pd
+from user_sim.utils.token_cost_calculator import calculate_cost, max_output_tokens_allowed, max_input_tokens_allowed
+import re
+from user_sim.utils.exceptions import *
+from user_sim.utils.utilities import init_model
+from user_sim.utils import config
+from langchain_core.prompts import ChatPromptTemplate
+# First value returned by init_model(); a single space until init_data_gathering_module() runs.
+model = " "
+# Second value returned by init_model(); get_json() treats None as "module not initialised".
+llm = None
+import logging
+logger = logging.getLogger('Info Logger')
+def init_data_gathering_module():
+    global model
+    global llm
+    model, llm = init_model()
+def extract_dict(in_val):
+    reg_ex = r'\{[^{}]*\}'
+    coincidence = re.search(reg_ex, in_val, re.DOTALL)
+    if coincidence:
+        return coincidence.group(0)
+    else:
+        return None
+def to_dict(in_val):
+    try:
+        dictionary = ast.literal_eval(extract_dict(in_val))
+    except (BadDictionaryGeneration, ValueError) as e:
+        logger.error(f"Bad dictionary generation: {e}. Setting empty dictionary value.")
+        dictionary = {}
+    return dictionary
+class ChatbotAssistant:
+    def __init__(self, ask_about):
+        self.verification_description = "the following has been answered, confirmed or provided by the chatbot:"
+        self.data_description = """"the piece of the conversation where the following has been answered
+                                or confirmed by the assistant. Don't consider the user's interactions:"""
+        self.properties = self.process_ask_about(ask_about)
+        self.system_message = """You are a helpful assistant that detects when a query in a conversation
+                                has been answered, confirmed or provided by the chatbot."""
+        self.messages = ""
+        self.gathering_register = {}
+    def process_ask_about(self, ask_about):
+        properties = {
+        }
+        for ab in ask_about:
+            properties[ab.replace(' ', '_')] = {
+                "type": "object",
+                "properties": {
+                    "verification": {
+                        "type": "boolean",
+                        "description": f"{self.verification_description} {ab}"
+                    },
+                    "data": {
+                        "type": ["string", "null"],
+                        "description": f"{self.data_description} {ab} "
+                    }
+                },
+                "required": ["verification", "data"],
+                "additionalProperties": False
+            }
+        return properties
+    def add_message(self, history):     # adds directly the chat history from user_simulator "self.conversation_history"
+        text = ""
+        for entry in history['interaction']:
+            for speaker, message in entry.items():
+                text += f"{speaker}: {message}\n"
+        self.messages = text
+        self.gathering_register = self.create_dataframe()
+    def get_json(self):
+        response_format = {
+                "title": "data_gathering",
+                "type": "object",
+                "description": "The information to check.",
+                "properties": self.properties,
+                "required": list(self.properties.keys()),
+                "additionalProperties": False
+        }
+        parsed_input_message = self.messages + self.verification_description + self.data_description
+        if llm is None:
+            logger.error("data gathering module not initialized.")
+            return "Empty data"
+        if max_input_tokens_allowed(parsed_input_message, model):
+            logger.error(f"Token limit was surpassed")
+            return None
+        if config.token_count_enabled:
+            llm.max_tokens = max_output_tokens_allowed(model)
+        prompt = ChatPromptTemplate.from_messages([("system", self.system_message), ("human", "{input}")])
+        structured_llm = llm.with_structured_output(response_format)
+        prompted_structured_llm = prompt | structured_llm
+        try:
+            response = prompted_structured_llm.invoke({"input": self.messages})
+            parsed_output_message = str(response)
+        except Exception as e:
+            logger.error(f"Truncated data in message: {e}")
+            response = parsed_output_message = None
+        if config.token_count_enabled:
+            calculate_cost(parsed_input_message, parsed_output_message, model=config.model, module="data_extraction")
+        return response
+    def create_dataframe(self):
+        data_dict = self.get_json()
+        if data_dict is None:
+            df = self.gathering_register
+        else:
+            try:
+                df = pd.DataFrame.from_dict(data_dict, orient='index')
+            except Exception as e:
+                logger.error(f"{e}. data_dict: {data_dict}. Retrieving data frame from gathering_register")
+                df = self.gathering_register
+        return df

user_sim/core/interaction_styles.py ADDED Viewed

@@ -0,0 +1,147 @@
+import random
+import logging
+logger = logging.getLogger('Info Logger')
+def find_instance(instances, i_class):
+    for instance in instances:
+        if isinstance(instance, i_class):
+            return instance
+    return None
+def create_instance(class_list, interaction_styles):
+    instances = []
+    for class_info in class_list:
+        class_name = class_info['clase']
+        args = class_info.get('args', [])
+        kwargs = class_info.get('kwargs', {})
+        if class_name in interaction_styles:
+            instance = interaction_styles[class_name](*args, **kwargs)
+            instances.append(instance)
+        else:
+            raise ValueError(f"Couldn't find {class_name} in interaction list.")
+    return instances
+class InteractionStyle:
+    def __init__(self, inter_type):
+        self.inter_type = inter_type
+        self.change_language_flag = False
+        self.languages_options = []
+    def get_prompt(self):
+        return
+    def get_metadata(self):
+        return
+class LongPhrases(InteractionStyle):
+    def __init__(self):
+        super().__init__(inter_type='long phrases')
+    def get_prompt(self):
+        return "use very long phrases to write anything. "
+    def get_metadata(self):
+        return self.inter_type
+class ChangeYourMind(InteractionStyle):
+    def __init__(self):
+        super().__init__(inter_type='change your mind')
+    def get_prompt(self):
+        return "eventually, change your mind about any information you provided. "
+    def get_metadata(self):
+        return self.inter_type
+class ChangeLanguage(InteractionStyle):
+    # TODO: add chance variable with *args
+    def __init__(self, default_language):
+        super().__init__(inter_type='change language')
+        self.default_language = default_language
+        self.languages_list = []
+        self.chance = 0.3
+    def get_prompt(self):
+        lang = self.language(self.chance)
+        prompt = f"""Please, always talk in {lang}, even If the assistant tells you that he doesn't understand,
+                or you had a conversation in another language before. """
+        return prompt
+    def language(self, chance=0.3):
+        chance = chance*100
+        rand_number = random.randint(1, 100)
+        if rand_number <= chance:
+            lang = random.choice(self.languages_options)
+            logger.info(f'Language was set to {lang}')
+            self.languages_list.append(lang)
+            return lang
+        else:
+            self.languages_list.append(self.default_language)
+            logger.info(f'Language was set to default ({self.default_language})')
+            return self.default_language
+    def reset_language_list(self):
+        self.languages_list.clear()
+    def get_metadata(self):
+        language_list = self.languages_list.copy()
+        self.reset_language_list()
+        return {'change languages': language_list}
+class MakeSpellingMistakes(InteractionStyle):
+    def __init__(self):
+        super().__init__(inter_type='make spelling mistakes')
+    def get_prompt(self):
+        prompt = """
+                 please, make several spelling mistakes during the conversation. Minimum 5 typos per
+                 sentence if possible.
+                 """
+        return prompt
+    def get_metadata(self):
+        return self.inter_type
+class SingleQuestions(InteractionStyle):
+    def __init__(self):
+        super().__init__(inter_type='single questions')
+    def get_prompt(self):
+        return "ask only one question per interaction. "
+    def get_metadata(self):
+        return self.inter_type
+class AllQuestions(InteractionStyle):
+    # todo: all questions should only get questions from ask_about
+    def __init__(self):
+        super().__init__(inter_type='all questions')
+    def get_prompt(self):
+        return "ask everything you have to ask in one sentence. "
+    def get_metadata(self):
+        return self.inter_type
+class Default(InteractionStyle):
+    def __init__(self):
+        super().__init__(inter_type='default')
+    def get_prompt(self):
+        return "Ask about one or two things per interaction, don't ask everything you want to know in one sentence."
+    def get_metadata(self):
+        return self.inter_type