user_simulator-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. user_sim/__init__.py +0 -0
  2. user_sim/cli/__init__.py +0 -0
  3. user_sim/cli/gen_user_profile.py +34 -0
  4. user_sim/cli/init_project.py +65 -0
  5. user_sim/cli/sensei_chat.py +481 -0
  6. user_sim/cli/sensei_check.py +103 -0
  7. user_sim/cli/validation_check.py +143 -0
  8. user_sim/core/__init__.py +0 -0
  9. user_sim/core/ask_about.py +665 -0
  10. user_sim/core/data_extraction.py +260 -0
  11. user_sim/core/data_gathering.py +134 -0
  12. user_sim/core/interaction_styles.py +147 -0
  13. user_sim/core/role_structure.py +608 -0
  14. user_sim/core/user_simulator.py +302 -0
  15. user_sim/handlers/__init__.py +0 -0
  16. user_sim/handlers/asr_module.py +128 -0
  17. user_sim/handlers/html_parser_module.py +202 -0
  18. user_sim/handlers/image_recognition_module.py +139 -0
  19. user_sim/handlers/pdf_parser_module.py +123 -0
  20. user_sim/utils/__init__.py +0 -0
  21. user_sim/utils/config.py +47 -0
  22. user_sim/utils/cost_tracker.py +153 -0
  23. user_sim/utils/cost_tracker_v2.py +193 -0
  24. user_sim/utils/errors.py +15 -0
  25. user_sim/utils/exceptions.py +47 -0
  26. user_sim/utils/languages.py +78 -0
  27. user_sim/utils/register_management.py +62 -0
  28. user_sim/utils/show_logs.py +63 -0
  29. user_sim/utils/token_cost_calculator.py +338 -0
  30. user_sim/utils/url_management.py +60 -0
  31. user_sim/utils/utilities.py +568 -0
  32. user_simulator-0.1.0.dist-info/METADATA +733 -0
  33. user_simulator-0.1.0.dist-info/RECORD +37 -0
  34. user_simulator-0.1.0.dist-info/WHEEL +5 -0
  35. user_simulator-0.1.0.dist-info/entry_points.txt +6 -0
  36. user_simulator-0.1.0.dist-info/licenses/LICENSE.txt +21 -0
  37. user_simulator-0.1.0.dist-info/top_level.txt +1 -0
user_sim/utils/exceptions.py
@@ -0,0 +1,47 @@
+ class InvalidGoalException(Exception):
+     pass
+
+ class InvalidInteractionException(Exception):
+     pass
+
+ class InvalidLanguageException(Exception):
+     pass
+
+ class OutOfLimitException(Exception):
+     pass
+
+ class BadDictionaryGeneration(Exception):
+     pass
+
+ class InvalidItemType(Exception):
+     pass
+
+ class EmptyListExcept(Exception):
+     pass
+
+ class InvalidDataType(Exception):
+     pass
+
+ class InvalidFormat(Exception):
+     pass
+
+ class MissingStepDefinition(Exception):
+     pass
+
+ class InvalidGenerator(Exception):
+     pass
+
+ class VariableNotFound(Exception):
+     pass
+
+ class InvalidDependence(Exception):
+     pass
+
+ class InvalidFile(Exception):
+     pass
+
+ class NoCostException(Exception):
+     pass
+
+ class UnmachedList(Exception):
+     pass
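As a usage note: a minimal sketch of how callers might raise and catch one of these exceptions. The file path follows from the 47-line count matching user_sim/utils/exceptions.py in the file list; the validate_language helper and the supported-language check are hypothetical, not part of this diff.

    from user_sim.utils.exceptions import InvalidLanguageException

    def validate_language(language, supported):  # hypothetical helper
        if language not in supported:
            raise InvalidLanguageException(f"Unsupported language: {language}")

    try:
        validate_language("Klingon", ["English", "Spanish"])
    except InvalidLanguageException as error:
        print(f"Profile validation failed: {error}")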
user_sim/utils/languages.py
@@ -0,0 +1,78 @@
+ languages = [
+     "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani", "Bengali", "Bosnian", "Bulgarian",
+     "Catalan", "Chinese (Simplified)", "Chinese (Traditional)", "Croatian", "Czech", "Danish", "Dutch",
+     "English", "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian", "German", "Greek",
+     "Gujarati", "Hausa", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian", "Italian", "Japanese",
+     "Kannada", "Kazakh", "Korean", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam", "Marathi",
+     "Nepali", "Norwegian", "Persian", "Polish", "Portuguese", "Punjabi", "Romanian", "Russian", "Serbian",
+     "Slovak", "Slovenian", "Spanish", "Swahili", "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian",
+     "Urdu", "Vietnamese", "Zulu"
+ ]
+
+ languages_weights = {
+     "Afrikaans": 1,
+     "Albanian": 1,
+     "Amharic": 1,
+     "Arabic": 1,
+     "Armenian": 1,
+     "Azerbaijani": 1,
+     "Bengali": 1,
+     "Bosnian": 1,
+     "Bulgarian": 1,
+     "Catalan": 1,
+     "Chinese (Simplified)": 1,
+     "Chinese (Traditional)": 1,
+     "Croatian": 1,
+     "Czech": 1,
+     "Danish": 1,
+     "Dutch": 1,
+     "English": 1,
+     "Estonian": 1,
+     "Filipino": 1,
+     "Finnish": 1,
+     "French": 1,
+     "Galician": 1,
+     "Georgian": 1,
+     "German": 1,
+     "Greek": 1,
+     "Gujarati": 1,
+     "Hausa": 1,
+     "Hebrew": 1,
+     "Hindi": 1,
+     "Hungarian": 1,
+     "Icelandic": 1,
+     "Indonesian": 1,
+     "Italian": 1,
+     "Japanese": 1,
+     "Kannada": 1,
+     "Kazakh": 1,
+     "Korean": 1,
+     "Latvian": 1,
+     "Lithuanian": 1,
+     "Macedonian": 1,
+     "Malay": 1,
+     "Malayalam": 1,
+     "Marathi": 1,
+     "Nepali": 1,
+     "Norwegian": 1,
+     "Persian": 1,
+     "Polish": 1,
+     "Portuguese": 1,
+     "Punjabi": 1,
+     "Romanian": 1,
+     "Russian": 1,
+     "Serbian": 1,
+     "Slovak": 1,
+     "Slovenian": 1,
+     "Spanish": 1,
+     "Swahili": 1,
+     "Swedish": 1,
+     "Tamil": 1,
+     "Telugu": 1,
+     "Thai": 1,
+     "Turkish": 1,
+     "Ukrainian": 1,
+     "Urdu": 1,
+     "Vietnamese": 1,
+     "Zulu": 1
+ }
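A minimal sketch of how languages_weights might drive language selection; that it feeds weighted sampling is an assumption, and with every weight at 1 this reduces to a uniform choice:

    import random
    from user_sim.utils.languages import languages_weights

    # random.choices takes parallel population/weights sequences;
    # uniform weights make this equivalent to random.choice(languages).
    language = random.choices(
        population=list(languages_weights.keys()),
        weights=list(languages_weights.values()),
        k=1,
    )[0]
    print(language)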
user_sim/utils/register_management.py
@@ -0,0 +1,62 @@
+ import os
+ import json
+ import hashlib
+ import logging
+
+ current_script_dir = os.path.dirname(os.path.abspath(__file__))
+ project_root = os.path.abspath(os.path.join(current_script_dir, "../../..")) #change
+ temp_file_dir = os.path.join(project_root, "data/cache")
+
+ logger = logging.getLogger('Info Logger')
+
+
+ def save_register(register, name):
+     path = os.path.join(temp_file_dir, name)
+     with open(path, "w", encoding="utf-8") as file:
+         json.dump(register, file, ensure_ascii=False, indent=4)
+
+
+ def load_register(register_name):
+     register_path = os.path.join(temp_file_dir, register_name)
+     if not os.path.exists(temp_file_dir):
+         os.makedirs(temp_file_dir)
+         return {}
+     elif not os.path.exists(register_path):
+         with open(register_path, 'w', encoding="utf-8") as file:
+             json.dump({}, file, ensure_ascii=False, indent=4)
+         return {}
+     else:
+         with open(register_path, 'r', encoding="utf-8") as file:
+             hash_reg = json.load(file)
+         return hash_reg
+
+
+ def hash_generate(content_type=None, hasher=None, **kwargs):
+     # Create a fresh hasher per call; a shared default argument would
+     # accumulate state across calls.
+     if hasher is None:
+         hasher = hashlib.md5()
+     if content_type == "pdf":
+         with open(kwargs.get("content", ""), 'rb') as pdf_file:
+             buf = pdf_file.read()
+         hasher.update(buf)
+         return hasher.hexdigest()
+     else:
+         content = kwargs.get('content', '')
+         if isinstance(content, str):
+             hasher.update(content.encode("utf-8"))
+         else:
+             hasher.update(content)
+         return hasher.hexdigest()
+
+
+ def clear_register(register_name):
+     try:
+         path = os.path.join(temp_file_dir, register_name)
+         with open(path, 'w') as file:
+             json.dump({}, file)
+     except Exception as e:
+         logger.error(f"Couldn't clear cache; the cache file may not have been created during the execution: {e}")
+
+
+ def clean_temp_files():
+     clear_register("image_register.json")
+     clear_register("pdf_register.json")
+     clear_register("webpage_register.json")
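A short usage sketch for the cache helpers above, assuming a content hash keys each register entry; the key scheme and the placeholder value are illustrative, not taken from this diff:

    from user_sim.utils.register_management import (
        hash_generate, load_register, save_register)

    register = load_register("webpage_register.json")
    key = hash_generate(content="https://example.com/page")
    if key not in register:
        register[key] = "parsed page text"  # illustrative cached value
        save_register(register, "webpage_register.json")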
user_sim/utils/show_logs.py
@@ -0,0 +1,63 @@
+ import logging
+ import sys
+ import colorama
+
+
+ # Initialize colorama
+ colorama.init(autoreset=True)
+
+ # Define color codes
+ RESET = colorama.Style.RESET_ALL
+ BLACK = colorama.Fore.BLACK
+ RED = colorama.Fore.RED
+ GREEN = colorama.Fore.GREEN
+ YELLOW = colorama.Fore.YELLOW
+ BLUE = colorama.Fore.BLUE
+ MAGENTA = colorama.Fore.MAGENTA
+ CYAN = colorama.Fore.CYAN
+ WHITE = colorama.Fore.WHITE
+
+
+ class ColoredFormatter(logging.Formatter):
+     # Mapping of log levels to colors
+     LEVEL_COLORS = {
+         logging.DEBUG: CYAN,
+         logging.INFO: GREEN,
+         logging.WARNING: YELLOW,
+         logging.ERROR: RED,
+         logging.CRITICAL: MAGENTA,
+     }
+
+     def format(self, record):
+         # Get the color for the current log level
+         level_color = self.LEVEL_COLORS.get(record.levelno, WHITE)
+
+         # Apply the color to the level name and message
+         record.levelname = f"{level_color}{record.levelname}{RESET}"
+         record.msg = f"{level_color}{record.msg}{RESET}"
+
+         # Format the message
+         return super().format(record)
+
+
+ def create_logger(verbose, name=None):
+     if name:
+         my_logger = logging.getLogger(name)
+     else:
+         my_logger = logging.getLogger()
+
+     if verbose:
+         my_logger.setLevel(logging.DEBUG)
+     else:
+         my_logger.setLevel(logging.CRITICAL)
+
+     console_handler = logging.StreamHandler(sys.stdout)
+     console_handler.setLevel(logging.DEBUG)
+
+     log_format = ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+     # log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+     console_handler.setFormatter(log_format)
+
+     my_logger.addHandler(console_handler)
+
+     return my_logger
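A minimal sketch of wiring the colored logger up; the name matches the 'Info Logger' used by the other modules in this diff, and the messages are illustrative:

    from user_sim.utils.show_logs import create_logger

    logger = create_logger(verbose=True, name="Info Logger")
    logger.debug("cache hit")                 # printed in cyan
    logger.warning("approaching cost limit")  # printed in yellow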
user_sim/utils/token_cost_calculator.py
@@ -0,0 +1,338 @@
+ import re
+ import os
+ import base64
+ import tiktoken
+ import requests
+ import pandas as pd
+ import logging
+ from io import BytesIO
+ from PIL import Image
+ from langchain_core.output_parsers import StrOutputParser
+ from user_sim.utils import config
+ from user_sim.utils.utilities import get_encoding
+
+ logger = logging.getLogger('Info Logger')
+
+ columns = ["Conversation", "Test Name", "Module", "Model", "Total Cost",
+            "Timestamp", "Input Cost", "Input Message",
+            "Output Cost", "Output Message"]
+
+ PRICING = {
+     "gpt-4o": {"input": 2.5 / 10**6, "output": 10 / 10**6},
+     "gpt-4o-mini": {"input": 0.15 / 10**6, "output": 0.6 / 10**6},
+     "whisper": 0.006/60,
+     "tts-1": 0.0015/1000,  # (characters, not tokens)
+     "gemini-2.0-flash": 0
+ }
+
+ TOKENS = {
+     "gpt-4o": {"input": 10**6/2.5, "output": 10**6/10},
+     "gpt-4o-mini": {"input": 10**6/0.15, "output": 10**6/0.6},
+     "whisper": 60/0.006,
+     "tts-1": 1000/0.0015,  # (characters, not tokens)
+     "gemini-2.0-flash": 0
+ }
+
+ MAX_MODEL_TOKENS = {
+     "gpt-4o": 16384,
+     "gpt-4o-mini": 16384,
+     "gemini-2.0-flash": 10000000
+ }
+
+
+ DEFAULT_COSTS = {
+     # OpenAI model costs per 1M tokens
+     "gpt-4o": {"prompt": 5.00, "completion": 20.00},
+     "gpt-4o-mini": {"prompt": 0.60, "completion": 2.40},
+     "gpt-4.1": {"prompt": 2.00, "completion": 8.00},
+     "gpt-4.1-mini": {"prompt": 0.40, "completion": 1.60},
+     "gpt-4.1-nano": {"prompt": 0.10, "completion": 0.40},
+     # Google/Gemini model costs per 1M tokens
+     "gemini-2.0-flash": {"prompt": 0.10, "completion": 0.40},
+     "gemini-2.5-flash-preview-05-2023": {"prompt": 0.15, "completion": 0.60},
+     # Default fallback rates if model not recognized
+     "default": {"prompt": 0.10, "completion": 0.40},
+ }
+
+
+ def create_cost_dataset(serial, test_cases_folder):
+     folder = f"{test_cases_folder}/reports/__cost_reports__"
+     file = f"cost_report_{serial}.csv"
+     if not os.path.exists(folder):
+         os.makedirs(folder)
+         logger.info(f"Created cost report folder at: {folder}")
+
+     path = f"{folder}/{file}"
+
+     cost_df = pd.DataFrame(columns=columns)
+     cost_df.to_csv(path, index=False)
+     config.cost_ds_path = path
+     logger.info(f"Cost dataframe created at {path}.")
+
+
+ def count_tokens(text, model="gpt-4o-mini"):
+     try:
+         # First try to use the model name directly with tiktoken
+         encoding = tiktoken.encoding_for_model(model)
+     except (KeyError, ValueError):
+         # If tiktoken doesn't recognize the model, fall back to the
+         # cl100k_base encoding used by the GPT-4 model family
+         logger.warning(
+             f"Model '{model}' not recognized by tiktoken, using cl100k_base encoding"
+         )
+         encoding = tiktoken.get_encoding("cl100k_base")
+
+     return len(encoding.encode(text))
+
+
+ def calculate_text_cost(tokens, model="gpt-4o-mini", io_type="input"):
+     cost = tokens * PRICING[model][io_type]
+     return cost
+
+
+ def calculate_image_cost(image):
+     def get_dimensions(image_input):
+         try:
+             if isinstance(image_input, bytes):
+                 image_input = image_input.decode('utf-8')
+             if re.match(r'^https?://', image_input):  # Detects if it's a URL
+                 response = requests.get(image_input)
+                 response.raise_for_status()
+                 image = Image.open(BytesIO(response.content))
+             else:
+                 decoded_image = base64.b64decode(image_input)
+                 image = Image.open(BytesIO(decoded_image))
+
+             # Get the dimensions
+             w, h = image.size
+             return w, h
+         except Exception as e:
+             logger.error(e)
+             return None
+
+     dimensions = get_dimensions(image)
+     if dimensions is None:
+         logger.warning("Couldn't get image dimensions.")
+         return None
+     width, height = dimensions
+
+     # Initial configuration
+     price_per_million_tokens = 0.15
+     tokens_per_tile = 5667
+     base_tokens = 2833
+
+     # Calculate the number of tiles needed (512 x 512 pixels)
+     horizontal_tiles = (width + 511) // 512
+     vertical_tiles = (height + 511) // 512
+     total_tiles = horizontal_tiles * vertical_tiles
+
+     # Calculate the total tokens
+     total_tokens = base_tokens + (tokens_per_tile * total_tiles)
+
+     # Convert tokens to price
+     total_price = (total_tokens / 1_000_000) * price_per_million_tokens
+
+     return total_price
+
+
+ # VISION
+ def input_vision_module_cost(input_message, image, model):
+     input_tokens = count_tokens(input_message, model)
+     image_cost = calculate_image_cost(image)
+     if image_cost is None:
+         logger.warning("Image cost set to $0.")
+         image_cost = 0
+
+     model_pricing = PRICING[model]
+     input_cost = input_tokens * model_pricing["input"] + image_cost
+     return input_cost
+
+
+ def output_vision_module_cost(output_message, model):
+     output_tokens = count_tokens(output_message, model)
+     model_pricing = PRICING[model]
+     output_cost = output_tokens * model_pricing["output"]
+     return output_cost
+
+
+ # TTS-STT
+ def input_tts_module_cost(input_message, model):
+     model_pricing = PRICING[model]
+     input_cost = len(input_message) * model_pricing
+     return input_cost
+
+
+ def whisper_module_cost(audio_length, model):
+     model_pricing = PRICING[model]
+     input_cost = audio_length * model_pricing
+     return input_cost
+
+
+ # TEXT
+ def input_text_module_cost(input_message, model):
+     if isinstance(input_message, list):
+         input_message = ", ".join(input_message)
+     input_tokens = count_tokens(input_message, model)
+     model_pricing = PRICING[model]
+     input_cost = input_tokens * model_pricing["input"]
+     return input_cost
+
+
+ def output_text_module_cost(output_message, model):
+     if isinstance(output_message, list):
+         output_message = ", ".join(output_message)
+     output_tokens = count_tokens(output_message, model)
+     model_pricing = PRICING[model]
+     output_cost = output_tokens * model_pricing["output"]
+     return output_cost
+
+
+ def calculate_cost(input_message='', output_message='', model="gpt-4o", module=None, **kwargs):
+     if input_message is None:
+         input_message = ""
+     if output_message is None:
+         output_message = ""
+
+     if model not in PRICING:
+         raise ValueError(f"Pricing not available for model: {model}")
+
+     if model == "whisper":
+         input_cost = 0
+         output_cost = whisper_module_cost(kwargs.get("audio_length", 0), model)
+         total_cost = output_cost
+
+     elif model == "tts-1":
+         input_cost = input_tts_module_cost(input_message, model)
+         output_cost = 0
+         total_cost = input_cost
+
+     elif kwargs.get("image", None):
+         input_cost = input_vision_module_cost(input_message, kwargs.get("image", None), model)
+         output_cost = output_vision_module_cost(output_message, model)
+         total_cost = input_cost + output_cost
+
+     else:
+         input_cost = input_text_module_cost(input_message, model)
+         output_cost = output_text_module_cost(output_message, model)
+         total_cost = input_cost + output_cost
+
+     def update_dataframe():
+         new_row = {"Conversation": config.conversation_name, "Test Name": config.test_name, "Module": module,
+                    "Model": model, "Total Cost": total_cost, "Timestamp": pd.Timestamp.now(),
+                    "Input Cost": input_cost, "Input Message": input_message,
+                    "Output Cost": output_cost, "Output Message": output_message}
+
+         encoding = get_encoding(config.cost_ds_path)["encoding"]
+         cost_df = pd.read_csv(config.cost_ds_path, encoding=encoding)
+         cost_df.loc[len(cost_df)] = new_row
+         cost_df.to_csv(config.cost_ds_path, index=False)
+
+         config.total_cost = config.total_individual_cost = float(cost_df['Total Cost'].sum())
+
+         logger.info(f"Updated 'cost_report' dataframe with new cost from {module}.")
+
+     update_dataframe()
+
+
+ def get_cost_report(test_cases_folder):
+     export_path = test_cases_folder + "/reports/__cost_report__"
+     serial = config.serial
+     if not os.path.exists(export_path):
+         os.makedirs(export_path)
+
+     export_file_name = export_path + f"/report_{serial}.csv"
+
+     encoding = get_encoding(config.cost_ds_path)["encoding"]
+     temp_cost_df = pd.read_csv(config.cost_ds_path, encoding=encoding)
+     temp_cost_df.to_csv(export_file_name, index=False)
+
+
+ def max_input_tokens_allowed(text='', model_used='gpt-4o-mini', **kwargs):
+
+     def get_delta_verification(sim_cost, sim_ind_cost):
+         delta_cost = config.limit_cost - sim_cost
+         delta_individual_cost = config.limit_individual_cost - sim_ind_cost
+         logger.info(f"${delta_cost} for global and ${delta_individual_cost} for individual input cost left.")
+         return delta_cost <= 0 or delta_individual_cost <= 0
+
+     if not config.token_count_enabled:
+         return False
+
+     if kwargs.get("image", None):
+         input_cost = input_vision_module_cost(text, kwargs.get("image", 0), model_used)
+     elif model_used == "tts-1":
+         input_cost = input_tts_module_cost(text, model_used)
+     elif model_used == "whisper":
+         input_cost = whisper_module_cost(kwargs.get("audio_length", 0), model_used)
+     else:
+         input_cost = input_text_module_cost(text, model_used)
+
+     simulated_cost = input_cost + config.total_cost
+     simulated_individual_cost = input_cost + config.total_individual_cost
+     return get_delta_verification(simulated_cost, simulated_individual_cost)
+
+
+ def max_output_tokens_allowed(model_used):
+     if config.token_count_enabled:
+         delta_cost = config.limit_cost - config.total_cost
+         delta_individual_cost = config.limit_individual_cost - config.total_individual_cost
+
+         delta = min([delta_cost, delta_individual_cost])
+         output_tokens = round(delta * TOKENS[model_used]["output"])
+
+         if MAX_MODEL_TOKENS[model_used] < output_tokens:
+             output_tokens = MAX_MODEL_TOKENS[model_used]
+
+         logger.info(f"{output_tokens} output tokens left.")
+         return output_tokens
+     else:
+         return None
+
+
+ def invoke_llm(llm, prompt, input_params, model, module, parser=False):
+     # Outputs input messages as text.
+     if isinstance(input_params, dict):
+         messages = list(input_params.values())
+         parsed_messages = " ".join(messages)
+     else:
+         parsed_messages = input_params
+
+     # Measures max input tokens allowed by the execution
+     if config.token_count_enabled and max_input_tokens_allowed(parsed_messages, model):
+         logger.error(f"Token limit was surpassed in {module} module")
+         return None
+
+     # Calculates the amount of tokens left and updates the LLM max_tokens parameter
+     if config.token_count_enabled:
+         llm.max_tokens = max_output_tokens_allowed(model)
+
+     # Enables str output parser
+     if parser:
+         parser = StrOutputParser()
+         llm_chain = prompt | llm | parser
+     else:
+         llm_chain = prompt | llm
+
+     # Invoke LLM
+     try:
+         response = llm_chain.invoke(input_params)
+         if config.token_count_enabled:
+             calculate_cost(parsed_messages, response, model, module="user_simulator")
+     except Exception as e:
+         logger.error(e)
+         response = None
+     if response is None and module == "user_simulator":
+         response = "exit"
+
+     return response
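For orientation, a minimal sketch of the basic accounting primitive above; count_tokens and PRICING come straight from this module, while the prompt text is illustrative:

    from user_sim.utils.token_cost_calculator import PRICING, count_tokens

    prompt = "Summarize the refund policy in one sentence."
    tokens = count_tokens(prompt, model="gpt-4o-mini")
    # PRICING stores dollars per token: 0.15 / 10**6 for gpt-4o-mini input.
    cost = tokens * PRICING["gpt-4o-mini"]["input"]
    print(f"{tokens} tokens -> ${cost:.8f}")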
user_sim/utils/url_management.py
@@ -0,0 +1,60 @@
+ import re
+ from typing import List, Dict
+ from user_sim.handlers.pdf_parser_module import pdf_processor
+ from user_sim.handlers.image_recognition_module import image_description
+ from user_sim.handlers.html_parser_module import webpage_reader
+
+
+ def classify_links(message: str) -> Dict[str, List[str]]:
+     url_pattern = re.compile(r'https?://\S+')  # Capture URLs
+     links = url_pattern.findall(message)
+
+     classified_links = {
+         "images": [],
+         "pdfs": [],
+         "webpages": []
+     }
+
+     for link in links:
+         if re.search(r'\.(jpg|jpeg|png|gif|webp|bmp|tiff)$', link, re.IGNORECASE) or '<image>' in message:
+             clean_link = re.sub(r'</?image>', '', link)
+             classified_links["images"].append(clean_link)
+         elif re.search(r'\.pdf$', link, re.IGNORECASE) or 'application/pdf' in message:
+             classified_links["pdfs"].append(link)
+         else:
+             classified_links["webpages"].append(link)
+
+     return classified_links
+
+
+ def process_with_llm(link: str, category) -> str:
+     if category == "pdfs":
+         description = pdf_processor(link)
+     elif category == "images":
+         description = image_description(link, detailed=True)
+     else:
+         description = webpage_reader(link)
+     return f"{link} {description}"
+
+
+ def get_content(message: str) -> str:
+     classified_links = classify_links(message)
+     for category in classified_links:
+         for link in classified_links[category]:
+             replacement = process_with_llm(link, category)
+             message = message.replace(link, replacement)
+
+     return message
+
+
+ # def clean_temp_files():
+ #     clear_pdf_register()
+ #     clear_image_register()
+ #     clear_webpage_register()
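A short usage sketch for get_content; the message is illustrative, and each matched link triggers the corresponding handler, so this may call out to the network or an LLM:

    from user_sim.utils.url_management import get_content

    message = "Please read https://example.com/terms.pdf before replying."
    # The PDF link is replaced in place by "<link> <description>".
    print(get_content(message))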