h-adminsim 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. h_adminsim/__init__.py +5 -0
  2. h_adminsim/admin_staff.py +280 -0
  3. h_adminsim/assets/configs/data4primary.yaml +47 -0
  4. h_adminsim/assets/configs/data4secondary.yaml +47 -0
  5. h_adminsim/assets/configs/data4tertiary.yaml +47 -0
  6. h_adminsim/assets/country/address.json +141859 -0
  7. h_adminsim/assets/country/country_code.json +244 -0
  8. h_adminsim/assets/departments/department.json +85 -0
  9. h_adminsim/assets/departments/symptom.json +4530 -0
  10. h_adminsim/assets/fhir.schema.json +75253 -0
  11. h_adminsim/assets/names/firstname.txt +1219 -0
  12. h_adminsim/assets/names/lastname.txt +88799 -0
  13. h_adminsim/assets/prompts/cancel_patient_system.txt +38 -0
  14. h_adminsim/assets/prompts/intake_staff_task_user.txt +16 -0
  15. h_adminsim/assets/prompts/intake_supervisor_system.txt +8 -0
  16. h_adminsim/assets/prompts/intake_supervisor_user.txt +31 -0
  17. h_adminsim/assets/prompts/reschedule_patient_system.txt +38 -0
  18. h_adminsim/assets/prompts/schedule_patient_rejected_system.txt +42 -0
  19. h_adminsim/assets/prompts/schedule_patient_system.txt +36 -0
  20. h_adminsim/assets/prompts/schedule_staff_reasoning.txt +57 -0
  21. h_adminsim/assets/prompts/schedule_staff_sc_tool_calling.txt +13 -0
  22. h_adminsim/assets/prompts/schedule_staff_system.txt +10 -0
  23. h_adminsim/assets/prompts/schedule_staff_tool_calling.txt +41 -0
  24. h_adminsim/client/__init__.py +3 -0
  25. h_adminsim/client/google_client.py +209 -0
  26. h_adminsim/client/openai_client.py +199 -0
  27. h_adminsim/client/vllm_client.py +160 -0
  28. h_adminsim/environment/__init__.py +1 -0
  29. h_adminsim/environment/hospital.py +462 -0
  30. h_adminsim/environment/op_scheduling_simulation.py +1126 -0
  31. h_adminsim/pipeline/__init__.py +3 -0
  32. h_adminsim/pipeline/data_generator.py +192 -0
  33. h_adminsim/pipeline/evaluator.py +33 -0
  34. h_adminsim/pipeline/simulation.py +231 -0
  35. h_adminsim/registry/__init__.py +5 -0
  36. h_adminsim/registry/errors.py +89 -0
  37. h_adminsim/registry/models.py +126 -0
  38. h_adminsim/registry/phrases.py +10 -0
  39. h_adminsim/registry/pydantic_models.py +21 -0
  40. h_adminsim/registry/variables.py +9 -0
  41. h_adminsim/supervisor.py +182 -0
  42. h_adminsim/task/agent_task.py +900 -0
  43. h_adminsim/task/fhir_manager.py +222 -0
  44. h_adminsim/task/schedule_assign.py +151 -0
  45. h_adminsim/tools/__init__.py +5 -0
  46. h_adminsim/tools/agent_data_builder.py +124 -0
  47. h_adminsim/tools/data_converter.py +536 -0
  48. h_adminsim/tools/data_synthesizer.py +365 -0
  49. h_adminsim/tools/evaluator.py +258 -0
  50. h_adminsim/tools/sanity_checker.py +216 -0
  51. h_adminsim/tools/scheduling_rule.py +420 -0
  52. h_adminsim/utils/__init__.py +136 -0
  53. h_adminsim/utils/common_utils.py +698 -0
  54. h_adminsim/utils/fhir_utils.py +190 -0
  55. h_adminsim/utils/filesys_utils.py +135 -0
  56. h_adminsim/utils/image_preprocess_utils.py +188 -0
  57. h_adminsim/utils/random_utils.py +358 -0
  58. h_adminsim/version.txt +1 -0
  59. h_adminsim-1.0.0.dist-info/LICENSE +30 -0
  60. h_adminsim-1.0.0.dist-info/METADATA +494 -0
  61. h_adminsim-1.0.0.dist-info/RECORD +62 -0
  62. h_adminsim-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,365 @@
1
+ import os
2
+ import random
3
+ from tqdm import tqdm
4
+ from importlib import resources
5
+ from typing import Optional, Tuple
6
+ from decimal import Decimal, getcontext
7
+
8
+ from h_adminsim.task.schedule_assign import ScheduleAssigner
9
+ from h_adminsim.utils import Information, log, colorstr
10
+ from h_adminsim.utils.common_utils import *
11
+ from h_adminsim.utils.filesys_utils import json_load, txt_load, yaml_save, make_project_dir, json_save_fast
12
+ from h_adminsim.utils.random_utils import (
13
+ generate_random_prob,
14
+ generate_random_date,
15
+ generate_random_code,
16
+ generate_random_names,
17
+ generate_random_address,
18
+ generate_random_telecom,
19
+ generate_random_id_number,
20
+ generate_random_specialty,
21
+ generate_random_code_with_prob,
22
+ )
23
+
24
+
25
+
26
class DataSynthesizer:
    """Synthesize random but internally consistent hospital data.

    Produces, per hospital: metadata (dates, operating hours), departments,
    doctors (with specialties, capacities, and per-date schedules), and
    patients (with appointments, preferences, and demographics). Results are
    saved as JSON files under the project data directory.
    """

    def __init__(self, config):
        """
        Args:
            config: Project configuration object; `config.hospital_data`
                drives every random range used during synthesis.
        """
        # Initialize configuration, paths, and the save directory
        self.config = config
        self._n = self.config.hospital_data.hospital_n
        self._save_dir = make_project_dir(self.config)
        self._data_save_dir = self._save_dir / 'data'
        yaml_save(self._save_dir / 'args.yaml', self.config)
        os.makedirs(self._data_save_dir, exist_ok=True)
        # Decimal precision for the exact capacity/interval divisibility check below
        getcontext().prec = 10


    def synthesize(self,
                   return_obj: bool = False,
                   sanity_check: bool = False) -> Tuple[list[Information], list[Hospital]]:
        """
        Synthesize hospital data based on the configuration settings.

        Args:
            return_obj (bool, optional): Whether to return the hospital data object.
            sanity_check (bool, optional): If you want to check whether the generated data are compatible with the `Hospital` object,
                                           you can use this option. Implies `return_obj=True`.

        Raises:
            Exception: Re-raised if data synthesis fails at any step.

        Returns:
            Tuple[list[Information], list[Hospital]]: A tuple containing the synthesized hospital data as an Information object and a Hospital object.
                                                      The hospital objects are `None` when `return_obj` is False.
        """
        if sanity_check:
            # The round-trip check below needs the object form
            return_obj = True

        try:
            all_data, all_hospitals = list(), list()
            hospitals = DataSynthesizer.hospital_list_generator(self.config.hospital_data.hospital_n)
            for i, hospital in tqdm(enumerate(hospitals), desc='Synthesizing data..', total=len(hospitals)):
                data = DataSynthesizer.define_hospital_info(self.config, hospital)
                hospital_obj = convert_info_to_obj(data) if return_obj else None
                if sanity_check:
                    # Round-trip through the object form must be lossless
                    new_data = convert_obj_to_info(hospital_obj)
                    assert to_dict(data) == to_dict(new_data), colorstr('red', 'Sanity check failed: info -> obj -> info round-trip mismatch')
                json_save_fast(self._data_save_dir / f'hospital_{padded_int(i, len(str(self._n)))}.json', to_dict(data))
                all_data.append(data)
                all_hospitals.append(hospital_obj)
            log(f"Total {len(hospitals)} data synthesizing completed. Path: `{self._data_save_dir}`", color=True)
            return all_data, all_hospitals

        except Exception as e:
            log(f"Data synthesizing failed: {e}", level='error')
            # Bare `raise` preserves the original traceback
            raise


    @staticmethod
    def define_hospital_info(config, hospital_name: str) -> Information:
        """
        Define the synthetic hospital data, including its departments and doctors.

        Args:
            config: Configuration object containing hospital data settings.
            hospital_name (str): Name of the hospital to be defined.

        Raises:
            AssertionError: If the generated data fails internal consistency checks.

        Returns:
            Information: Synthetic data about the hospital.
        """
        # Define hospital metadata
        days = config.hospital_data.days
        dates = generate_date_range(
            generate_random_iso_date_between(
                str(config.hospital_data.start_date.min),
                str(config.hospital_data.start_date.max),
            ),
            days
        )
        interval_hour = float(config.hospital_data.interval_hour)
        start_hour = float(random.randint(config.hospital_data.start_hour.min, config.hospital_data.start_hour.max))
        end_hour = float(random.randint(config.hospital_data.end_hour.min, config.hospital_data.end_hour.max))
        operation_hour_per_day = int(end_hour - start_hour)
        department_n = random.randint(
            config.hospital_data.department_per_hospital.min,
            config.hospital_data.department_per_hospital.max
        )
        doctor_n_per_department = [random.randint(config.hospital_data.doctor_per_department.min, config.hospital_data.doctor_per_department.max)
                                   for _ in range(department_n)]
        doctor_n = sum(doctor_n_per_department)
        # Only keep capacities whose slot duration (1/c hours) aligns exactly with the
        # scheduling interval; Decimal avoids float modulo artifacts (e.g. 0.1 intervals)
        doctor_capacity_per_hour_list = [c for c in range(config.hospital_data.doctor_capacity_per_hour.min, config.hospital_data.doctor_capacity_per_hour.max + 1)
                                         if float(Decimal(str(1)) / Decimal(str(c)) % Decimal(str(interval_hour))) == 0]
        hospital_time_segments = convert_time_to_segment(start_hour, end_hour, interval_hour)
        metadata = Information(
            hospital_name=hospital_name,
            start_date=dates[0],
            end_date=dates[-1],
            days=days,
            department_num=department_n,
            doctor_num=doctor_n,
            time=Information(
                start_hour=start_hour,
                end_hour=end_hour,
                interval_hour=interval_hour
            )
        )

        # Define ScheduleAssigner class to randomly assign schedules to each doctor
        scheduler = ScheduleAssigner(start_hour, end_hour, interval_hour)

        # Define detailed hospital department, doctoral, and patient information
        department_info, doctor_info, patient_info = dict(), dict(), dict()
        departments = DataSynthesizer.department_list_generator(department_n)
        doctors = DataSynthesizer.name_list_generator(doctor_n, prefix='Dr. ')    # Doctor names are unique across all departments
        for department_data, doc_n in zip(departments, doctor_n_per_department):
            department, dep_code = department_data

            # Add department information
            department_info[department] = {'code': dep_code if dep_code else 'NA', 'doctor': []}

            # Add doctor information
            for _ in range(doc_n):
                doctor = doctors.pop()
                department_info[department]['doctor'].append(doctor)
                specialty, spe_code = generate_random_specialty(department)
                capacity_per_hour = random.choice(doctor_capacity_per_hour_list)
                working_days = random.randint(
                    config.hospital_data.working_days.min,
                    config.hospital_data.working_days.max
                )
                working_dates = sorted(random.sample(dates, working_days))
                doctor_info[doctor] = {
                    'department': department,
                    'specialty': {
                        'name': specialty,
                        'code': spe_code,
                    },
                    'schedule': {},
                    'capacity_per_hour': int(capacity_per_hour),
                    'capacity': int(capacity_per_hour * operation_hour_per_day * len(working_dates)),
                    'gender': generate_random_code('gender'),
                    'telecom': [{
                        'system': 'phone',
                        'value': generate_random_telecom(),
                        'use': generate_random_code('use')
                    }],
                    'birthDate': generate_random_date()
                }
                # Appointment length in scheduling segments for this doctor
                duration = int(1 / capacity_per_hour / interval_hour)

                # Generate doctor schedules and appointments based on the pre-defined days
                for date in dates:
                    # Working day case
                    if date in working_dates:
                        schedule_segments, schedule_times = scheduler(
                            generate_random_prob(
                                config.hospital_data.doctor_has_schedule_prob,
                                config.hospital_data.schedule_coverage_ratio.min,
                                config.hospital_data.schedule_coverage_ratio.max
                            )
                        )
                        doctor_info[doctor]['schedule'][date] = schedule_times
                    # Not working day case: the whole day is blocked out
                    else:
                        schedule_segments, schedule_times = scheduler(1)
                        doctor_info[doctor]['schedule'][date] = schedule_times

                    # Add patient information per doctor; patients can only book segments
                    # not already blocked by the doctor's own schedule
                    patient_segments = list(set(hospital_time_segments) - set(sum(schedule_segments, [])))
                    _, appointments = scheduler(
                        generate_random_prob(
                            1,
                            config.hospital_data.appointment_coverage_ratio.min,
                            config.hospital_data.appointment_coverage_ratio.max
                        ),
                        True,
                        patient_segments,
                        min_chunk_size=duration,
                        max_chunk_size=duration
                    )
                    patients = DataSynthesizer.name_list_generator(len(appointments))
                    for patient, appointment in zip(patients, appointments):
                        preference = generate_random_code_with_prob(
                            config.hospital_data.preference.type,
                            config.hospital_data.preference.probs
                        )
                        preference_rank = DataSynthesizer.second_preference_generator(preference)
                        symptom_level = generate_random_code_with_prob(
                            config.hospital_data.symptom.type,
                            config.hospital_data.symptom.probs
                        )
                        birth_date = generate_random_date()
                        patient_info[patient] = {
                            'department': department,
                            'attending_physician': doctor,
                            'date': date,
                            'schedule': appointment,
                            'preference': preference_rank,
                            'symptom_level': symptom_level,
                            'gender': generate_random_code('gender'),
                            'telecom': [{
                                'system': 'phone',
                                'value': generate_random_telecom(),
                                'use': generate_random_code('use')
                            }],
                            'birthDate': birth_date,
                            'identifier': [{
                                'value': generate_random_id_number(birth_date=birth_date),
                                'use': 'official'
                            }],
                            'address': [{
                                'type': 'postal',
                                'text': generate_random_address(),
                                'use': 'home'
                            }]
                        }

        # Finalize data structure
        data = Information(
            metadata=metadata,
            department=department_info,
            doctor=doctor_info,
            patient=patient_info,
        )

        # Data sanity check
        if len(data.department) != metadata.department_num:
            raise AssertionError(colorstr('red', 'Department number mismatch'))
        if len(data.department) != len(set(doc['department'] for doc in data.doctor.values())):
            raise AssertionError(colorstr('red', 'Department number mismatch'))
        if len(data.doctor) != metadata.doctor_num:
            raise AssertionError(colorstr('red', 'Doctor number mismatch'))
        if len(data.doctor) != sum(len(dept['doctor']) for dept in data.department.values()):
            raise AssertionError(colorstr('red', 'Doctor number mismatch'))

        return data


    @staticmethod
    def hospital_list_generator(hospital_n: int,
                                file_path: Optional[str] = None) -> list[str]:
        """
        Generate a list of hospital names based on the number of hospitals.

        Args:
            hospital_n (int): Number of hospitals to generate.
            file_path (Optional[str], optional): Path to a file containing hospital names. If provided, it will be used to load names.

        Returns:
            list[str]: List of hospital names in the format "hospital_001", "hospital_002", etc.,
                       or random names drawn from the file when `file_path` is given.
        """
        if file_path:
            # Names are loaded once and cached in the registry for subsequent calls
            if registry.HOSPITALS is None:
                registry.HOSPITALS = [word.capitalize() for word in txt_load(file_path).split('\n') if word.strip()]
            return [f"{random.choice(registry.HOSPITALS)}" for _ in range(hospital_n)]

        zfill_l = len(str(hospital_n))
        return [f"hospital_{padded_int(i, zfill_l)}" for i in range(hospital_n)]


    @staticmethod
    def department_list_generator(department_n: int,
                                  file_path: Optional[str] = None) -> list[Tuple[str, str]]:
        """
        Generate a list of department names based on the number of departments.

        Args:
            department_n (int): Number of departments to generate.
            file_path (Optional[str], optional): Path to a file containing department names. If provided, it will be used to load names.
                                                 Defaults to the bundled `department.json` asset.

        Raises:
            ValueError: If more departments are requested than are available in the file.

        Returns:
            list[Tuple[str, str]]: List of department names and their codes.
        """
        if file_path is None:
            file_path = str(resources.files("h_adminsim.assets.departments").joinpath("department.json"))

        if file_path:
            # Departments are loaded once and cached in the registry for subsequent calls.
            # NOTE(review): the cache is keyed globally, not per file path — a second call
            # with a different `file_path` reuses the first file's departments.
            if registry.DEPARTMENTS is None:
                specialty = json_load(file_path)['specialty']
                registry.DEPARTMENTS = [(k2, v2['code']) for v1 in specialty.values() for k2, v2 in v1['subspecialty'].items()]

            if department_n > len(registry.DEPARTMENTS):
                raise ValueError(f"Requested {department_n} departments, but only {len(registry.DEPARTMENTS)} available in {file_path}.")

            return random.sample(registry.DEPARTMENTS, department_n)

        zfill_l = len(str(department_n))
        return [(f"department_{padded_int(i, zfill_l)}", None) for i in range(department_n)]


    @staticmethod
    def name_list_generator(n: int,
                            first_name_file_path: Optional[str] = None,
                            last_name_file_path: Optional[str] = None,
                            prefix: Optional[str] = None) -> list[str]:
        """
        Generate a list of names.

        Args:
            n (int): Number of names to generate.
            first_name_file_path (Optional[str], optional): Path to a file containing first names. Defaults to the bundled asset.
            last_name_file_path (Optional[str], optional): Path to a file containing last names. Defaults to the bundled asset.
            prefix (Optional[str], optional): Prefix for the generated names (e.g. 'Dr. ').

        Raises:
            TypeError: If `prefix` is given but is not a string.

        Returns:
            list[str]: List of shuffled names.
        """
        if first_name_file_path is None:
            first_name_file_path = str(resources.files("h_adminsim.assets.names").joinpath("firstname.txt"))
        if last_name_file_path is None:
            last_name_file_path = str(resources.files("h_adminsim.assets.names").joinpath("lastname.txt"))

        if prefix is not None:
            # An explicit raise (instead of `assert`) survives `python -O`
            if not isinstance(prefix, str):
                log("`prefix` must be a string type", "error")
                raise TypeError("`prefix` must be a string type")
            names = [f'{prefix}{name}' for name in generate_random_names(n, first_name_file_path, last_name_file_path)]
        else:
            names = list(generate_random_names(n, first_name_file_path, last_name_file_path))
        random.shuffle(names)
        return names


    @staticmethod
    def second_preference_generator(preference: str) -> list[str]:
        """
        Generate a list of preferences based on the initial preference.

        Args:
            preference (str): First priority of preference.

        Returns:
            list[str]: List of preferences including first and second priority.
                       Unknown preferences are returned alone, without a second priority.
        """
        # The second priority is drawn uniformly from the two remaining options
        alternatives = {
            'doctor': ['asap', 'date'],
            'date': ['asap', 'doctor'],
            'asap': ['date', 'doctor'],
        }

        preference_list = [preference]
        if preference in alternatives:
            preference_list.append(random.choice(alternatives[preference]))

        return preference_list
@@ -0,0 +1,258 @@
1
+ import os
2
+ import numpy as np
3
+ from collections import Counter
4
+
5
+ from h_adminsim.utils import log, colorstr
6
+ from h_adminsim.utils.filesys_utils import get_files, json_load
7
+ from h_adminsim.utils.image_preprocess_utils import draw_fail_donut_subplots
8
+
9
+
10
+
11
class Evaluator:
    """Aggregate simulation result files under `path` and log evaluation reports.

    All report methods read `*_result.json` files (collected in `__init__`) and
    write human-readable summaries through `log`; none of them return values.
    """

    def __init__(self, path, human_eval=False):
        """
        Args:
            path: Directory containing the result files.
            human_eval (bool, optional): When True, also collect `.txt` files for
                `human_evaluation`. The `human_eval_files` attribute only exists
                in that case.
        """
        self.path = path
        self.files = get_files(self.path, '_result.json')
        if human_eval:
            self.human_eval_files = get_files(self.path, '.txt')

        # Dialog files are optional; default to an empty list so that
        # `calculate_avg_rounds` never hits an AttributeError
        try:
            self.dialog_files = get_files(self.path, '_dialog.json')
        except Exception:
            self.dialog_files = []


    def task_evaluation(self):
        """
        Perform macro- and micro-wise evaluation on the aggregated results.
        """
        aggregated_results = dict()
        for file in self.files:
            data = json_load(file)

            for task, value in data.items():
                if task not in aggregated_results:
                    aggregated_results[task] = {'status': [], 'status_code': []}

                aggregated_results[task]['status'].append(value['status'])
                aggregated_results[task]['status_code'].append(value['status_code'])

        # Macro-wise evaluation: per-file accuracy averaged over files
        log('--------------Macro-wise Evaluation--------------')
        for task, value in aggregated_results.items():
            # Each entry in `status` may be a bool or a list of bools; weigh lists by length
            accuracies = [sum(x if isinstance(x, bool) else sum(x) for x in status) / sum(1 if isinstance(x, bool) else len(x) for x in status) * 100 for status in value['status']]
            avg_accuracy = sum(accuracies) / len(accuracies)
            # Population standard deviation across files
            stdv = round((sum((x - avg_accuracy) ** 2 for x in accuracies) / len(accuracies)) ** 0.5, 2) if len(accuracies) > 1 else 0.0
            log(f'{colorstr(task):<27} | average accuracy: {colorstr("green", f"{avg_accuracy:.2f}% ± {stdv}")}, files: {len(accuracies)}')
            log(f'  - Individual accuracies: {", ".join([colorstr("green", f"{acc:.2f}%") for acc in accuracies])}')


        # Micro-wise evaluation: every individual case pooled across files
        log('')
        log('--------------Micro-wise Evaluation--------------')
        fail_data_dict = dict()
        for task, value in aggregated_results.items():
            status = [x for y in sum(value['status'], []) for x in (y if isinstance(y, (list, tuple)) else [y])]
            status_code = [x for y in sum(value['status_code'], []) for x in (y if isinstance(y, (list, tuple)) else [y])]
            accuracy = sum(status) / len(status) * 100
            # 'unexpected' codes are excluded from the error-rate breakdown
            failed_cases = [c for s, c in zip(status, status_code) if not s and 'unexpected' not in c]
            error_rate = (len(failed_cases) / len(status)) * 100
            log(f'{colorstr(task):<27} | accuracy: {colorstr("green", f"{accuracy:.2f}%")}, length: {sum(status)} / {len(status)}')
            log(f'{colorstr(task):<27} | Error   : {colorstr("red", f"{error_rate:.2f}%")}, length: {len(failed_cases)} / {len(status)}')

            if failed_cases:
                fail_summary = Counter(failed_cases)
                reschedule_fail_summary = Counter()

                # Fold "reschedule:<type>" codes into their base type while keeping
                # a separate count of how many came from reschedules
                for k, v in list(fail_summary.items()):
                    if k.startswith("reschedule:") and 'identify' not in k and 'unexpected' not in k:
                        norm_key = k.replace("reschedule:", "").strip()
                        fail_summary[norm_key] += v
                        reschedule_fail_summary[norm_key] += v
                        fail_summary.pop(k)

                for fail_type, count in fail_summary.items():
                    percent = (count / len(failed_cases)) * 100
                    reschedule_n = reschedule_fail_summary[fail_type] if fail_type in reschedule_fail_summary else 0
                    if reschedule_n:
                        log(f'  - Fail type {colorstr("red", fail_type):<30}: {count} (reschedule: {reschedule_n}) cases ({percent:.2f}%)')
                    else:
                        log(f'  - Fail type {colorstr("red", fail_type):<30}: {count} cases ({percent:.2f}%)')
                fail_data_dict[task] = failed_cases

        draw_fail_donut_subplots(fail_data_dict, os.path.join(self.path, 'fails.png'))


    def ipi_evaluation(self):
        """
        Micro-wise IPI performance evaluation on the aggregated results.
        """
        aggregated_results = dict()
        for file in self.files:
            data = json_load(file)

            if 'intake' not in aggregated_results:
                aggregated_results['intake'] = {'status': [], 'status_code': []}

            aggregated_results['intake']['status'].append(data['intake']['status'])
            aggregated_results['intake']['status_code'].append(data['intake']['status_code'])

        # Micro-wise evaluation
        log('')
        log('------------------IPI Evaluation-----------------')
        status = sum(aggregated_results['intake']['status'], [])
        status_code = sum(aggregated_results['intake']['status_code'], [])
        failed_cases = [c for s, c in zip(status, status_code) if not s]

        if failed_cases:
            if_err_count, ipi_err_count = 0, 0
            fail_summary = Counter(failed_cases)
            for fail_type, count in fail_summary.items():
                if fail_type in ['incorrect department and patient information', 'incorrect patient information']:
                    ipi_err_count += count
                elif fail_type in ['incorrect format']:
                    if_err_count += count

            if_percent = (if_err_count / len(status)) * 100
            ipi_percent = (ipi_err_count / len(status)) * 100
            log(f'  - Fail type {colorstr("red", "incorrect format"):<38}: {if_err_count} / {len(status)} ({if_percent:.2f}%)')
            log(f'  - Fail type {colorstr("red", "incorrect patient information"):<38}: {ipi_err_count} / {len(status)} ({ipi_percent:.2f}%)')


    def supervisor_evaluation(self):
        """
        Evaluate the supervisor's necessity to intervene in tasks.
        """
        aggregated_results = dict()
        for file in self.files:
            data = json_load(file)

            for task, value in data.items():
                if task not in aggregated_results:
                    aggregated_results[task] = {'status': [], 'trial': []}

                aggregated_results[task]['status'].append(value['status'])
                aggregated_results[task]['trial'].append(value['trial'])

        log('-----Supervisor (or feedback) Evaluation----')
        for task, value in aggregated_results.items():
            status = sum(value['status'], [])
            trial = sum(value['trial'], [])

            if task == 'intake':
                total_length = len(status)
                supervisor_effect_cnt, correct, error, tie = 0, 0, 0, 0
                for t in trial:
                    # A 'mismatch' marker means the supervisor changed the outcome;
                    # 'better'/'worse' grade the change, everything else is a tie
                    if 'mismatch' in t[0]:
                        supervisor_effect_cnt += 1
                        if 'better' in t[0]:
                            correct += 1
                        elif 'worse' in t[0]:
                            error += 1
                        else:
                            tie += 1

                correct_p = correct / supervisor_effect_cnt * 100 if supervisor_effect_cnt > 0 else 0
                error_p = error / supervisor_effect_cnt * 100 if supervisor_effect_cnt > 0 else 0
                tie_p = tie / supervisor_effect_cnt * 100 if supervisor_effect_cnt > 0 else 0
                log(f'{colorstr(task):<27} | length: {total_length}, effected: {supervisor_effect_cnt} ({(supervisor_effect_cnt/total_length)*100:.2f}%)')
                log(f'  - {colorstr("green", "correct")}: {correct} ({correct_p:.2f}%), {colorstr("red", "worse")}: {error} ({error_p:.2f}%), {colorstr("yellow", "tie")}: {tie} ({tie_p:.2f}%)')

            elif task == 'schedule':
                feedback_n = dict()
                total_length = len(status)
                supervisor_effect_cnt, correct, tie = 0, 0, 0
                for t in trial:
                    # More than one trial entry means at least one feedback round happened
                    if isinstance(t, list) and len(t) > 1:
                        supervisor_effect_cnt += 1
                        if t[-1] == 'pass':
                            correct += 1
                            feedback_n[len(t) - 1] = feedback_n.get(len(t) - 1, 0) + 1
                        else:
                            tie += 1

                desc = ', '.join([f'{f}-feedback: {n}' for f, n in sorted(feedback_n.items())])
                correct_p = correct / supervisor_effect_cnt * 100 if supervisor_effect_cnt > 0 else 0
                tie_p = tie / supervisor_effect_cnt * 100 if supervisor_effect_cnt > 0 else 0
                log(f'{colorstr(task):<27} | length: {total_length}, effected: {supervisor_effect_cnt} ({(supervisor_effect_cnt/total_length)*100:.2f}%)')
                log(f'  - {colorstr("green", "correct")}: {correct} ({correct_p:.2f}%), {colorstr("yellow", "tie")}: {tie} ({tie_p:.2f}%)')
                log(f'  - Feedback distribution: {desc}')


    def human_evaluation(self):
        """
        Aggregate and evaluate human evaluation results from text files.

        Each non-empty line is expected to be tab-separated:
        `arena<TAB>score_a<TAB>score_b<TAB>model_a<TAB>model_b`, where `arena`
        is 'A' when model A won (anything else counts as a win for model B).
        """
        scores = {'arena': dict(), 'score': dict()}
        all_lines = list()
        for file in self.human_eval_files:
            with open(file, 'r') as f:
                lines = f.readlines()
            all_lines.extend([line.strip() for line in lines if line.strip()])

        for line in all_lines:
            arena, score_a, score_b, model_a, model_b = line.split('\t')
            scores['arena'].setdefault(model_a, 0)
            scores['arena'].setdefault(model_b, 0)
            scores['score'].setdefault(model_a, [])
            scores['score'].setdefault(model_b, [])

            if arena == 'A':
                scores['arena'][model_a] += 1
            else:
                scores['arena'][model_b] += 1

            scores['score'][model_a].append(float(score_a))
            scores['score'][model_b].append(float(score_b))

        log('--------------Human Evaluation--------------')
        for model in scores['arena'].keys():
            arena_wins = scores['arena'][model]
            score_list = scores['score'][model]
            avg_score = sum(score_list) / len(score_list)
            stdv = round((sum((x - avg_score) ** 2 for x in score_list) / len(score_list)) ** 0.5, 2) if len(score_list) > 1 else 0.0
            log(f'{colorstr(model):<15} | Arena wins: {colorstr("green", str(arena_wins))}, Average score: {colorstr("green", f"{avg_score:.2f} ± {stdv}")}')


    def department_evaluation(self):
        """
        Evaluate solely department prediction accuracy.
        """
        aggregated_results = {'intake': {'gt': [], 'pred': [], 'status': []}}

        for file in self.files:
            data = json_load(file)
            aggregated_results['intake']['gt'].extend(data['intake']['gt'])
            aggregated_results['intake']['pred'].extend(data['intake']['pred'])
            aggregated_results['intake']['status'].extend(data['intake']['status'])

        gt = aggregated_results['intake']['gt']
        pred = aggregated_results['intake']['pred']
        status = aggregated_results['intake']['status']
        total_n, dept_err_n = len(gt), 0
        for g, p, s in zip(gt, pred, status):
            # Only failed cases can be department errors; a failed case whose
            # top-1 department is still in the ground-truth set is not counted
            if not s:
                gt_depts = g['department']
                pred_dept = p['department'][0]

                if pred_dept not in gt_depts:
                    dept_err_n += 1

        log('--------------Department Evaluation--------------')
        log(f'Error rate: {colorstr("red", f"{(dept_err_n/total_n)*100:.2f}%")}, length: {dept_err_n} / {total_n}')


    def calculate_avg_rounds(self):
        """
        Calculate average required intake rounds across all dialog files.
        """
        counts = list()
        for file in self.dialog_files:
            data = json_load(file)
            for dialog in data.values():
                # The first 'Staff: ' turn opens the dialog, so rounds = turns - 1
                counts.append(dialog.count('Staff: ') - 1)

        # Guard against an empty pool (no dialog files): np.mean([]) would be NaN
        if not counts:
            log('-----------------Average Rounds-----------------')
            log('Average Rounds: no dialog data found')
            return

        mean, stdv = np.mean(counts), np.std(counts)
        log('-----------------Average Rounds-----------------')
        log(f'Average Rounds: {mean:.2f} ± {stdv:.2f}')
+