user-simulator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. user_sim/__init__.py +0 -0
  2. user_sim/cli/__init__.py +0 -0
  3. user_sim/cli/gen_user_profile.py +34 -0
  4. user_sim/cli/init_project.py +65 -0
  5. user_sim/cli/sensei_chat.py +481 -0
  6. user_sim/cli/sensei_check.py +103 -0
  7. user_sim/cli/validation_check.py +143 -0
  8. user_sim/core/__init__.py +0 -0
  9. user_sim/core/ask_about.py +665 -0
  10. user_sim/core/data_extraction.py +260 -0
  11. user_sim/core/data_gathering.py +134 -0
  12. user_sim/core/interaction_styles.py +147 -0
  13. user_sim/core/role_structure.py +608 -0
  14. user_sim/core/user_simulator.py +302 -0
  15. user_sim/handlers/__init__.py +0 -0
  16. user_sim/handlers/asr_module.py +128 -0
  17. user_sim/handlers/html_parser_module.py +202 -0
  18. user_sim/handlers/image_recognition_module.py +139 -0
  19. user_sim/handlers/pdf_parser_module.py +123 -0
  20. user_sim/utils/__init__.py +0 -0
  21. user_sim/utils/config.py +47 -0
  22. user_sim/utils/cost_tracker.py +153 -0
  23. user_sim/utils/cost_tracker_v2.py +193 -0
  24. user_sim/utils/errors.py +15 -0
  25. user_sim/utils/exceptions.py +47 -0
  26. user_sim/utils/languages.py +78 -0
  27. user_sim/utils/register_management.py +62 -0
  28. user_sim/utils/show_logs.py +63 -0
  29. user_sim/utils/token_cost_calculator.py +338 -0
  30. user_sim/utils/url_management.py +60 -0
  31. user_sim/utils/utilities.py +568 -0
  32. user_simulator-0.1.0.dist-info/METADATA +733 -0
  33. user_simulator-0.1.0.dist-info/RECORD +37 -0
  34. user_simulator-0.1.0.dist-info/WHEEL +5 -0
  35. user_simulator-0.1.0.dist-info/entry_points.txt +6 -0
  36. user_simulator-0.1.0.dist-info/licenses/LICENSE.txt +21 -0
  37. user_simulator-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,568 @@
1
+ import os
2
+ import pandas as pd
3
+ import yaml
4
+ import json
5
+ import configparser
6
+ import re
7
+ import random
8
+ import importlib.util
9
+ import logging
10
+ import platform
11
+
12
+ from datetime import datetime, timedelta, date
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ from .exceptions import *
16
+ from user_sim.utils import config
17
+ from langchain.chat_models import init_chat_model
18
+ from charset_normalizer import detect
19
+
20
+ logger = logging.getLogger('Info Logger')
21
+
22
+
23
def check_keys(key_list: list):
    """Export keys from ``keys.properties`` (if present) and verify them.

    When a ``keys.properties`` file exists in the working directory, every
    option of its ``[keys]`` section is copied into ``os.environ`` under the
    upper-cased option name. Afterwards each name in ``key_list`` must resolve
    to a non-empty environment variable.

    Args:
        key_list: names of the environment variables that must be set.

    Raises:
        Exception: if any requested key is missing or empty.
    """
    properties_file = "keys.properties"

    if os.path.exists(properties_file):
        logger.info("properties found!")
        parser = configparser.ConfigParser()
        parser.read(properties_file)

        # Loop over all keys and values of the [keys] section
        section = parser['keys']
        for option in section:
            env_name = option.upper()
            os.environ[env_name] = section[env_name]

    for required in key_list:
        if not os.environ.get(required):
            raise Exception(f"{required} not found")
37
+
38
+
39
def end_alarm():
    """Play the notification sound on Windows; do nothing on other systems."""
    if platform.system() != "Windows":
        return

    # winsound only exists on Windows, so it is imported lazily here.
    import winsound
    winsound.PlaySound('config/misc/sound/c1bccaed.wav', winsound.SND_FILENAME)
44
+
45
+
46
def init_model():
    """Build the chat model described by the ``config`` module.

    ``config.model`` is always passed to ``init_chat_model``;
    ``config.model_provider`` is only passed when it is not ``None``.

    Returns:
        tuple: ``(model, llm)`` where ``model`` is ``config.model`` and
        ``llm`` is the object returned by ``init_chat_model``.
    """
    model = config.model
    provider = config.model_provider

    params = {"model": model}
    if provider is not None:
        params["model_provider"] = provider

    llm = init_chat_model(**params)
    return model, llm
64
+
65
+
66
+
67
def parse_content_to_text(messages):
    """Join the ``"content"`` value of every message with single spaces.

    Messages that have no ``"content"`` key are skipped.
    """
    contents = []
    for message in messages:
        if "content" in message:
            contents.append(message["content"])
    return " ".join(contents)
69
+
70
+
71
def get_encoding(encoded_file):
    """Run ``charset_normalizer.detect`` on the raw bytes of a file.

    Args:
        encoded_file: path of the file to inspect (opened in binary mode).

    Returns:
        The detection result returned by ``detect``; callers in this module
        read its ``"encoding"`` entry.
    """
    with open(encoded_file, 'rb') as stream:
        raw_bytes = stream.read()
        return detect(raw_bytes)
75
+
76
+
77
def save_json(msg, test_name, path):
    """Write ``msg`` as indented JSON to ``<path>/<test_name>_<timestamp>.json``.

    The timestamp is the current local time formatted as
    ``YYYY-MM-DD_HH-MM-SS``. The file is written with UTF-8 encoding.
    """
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    target = os.path.join(path, f'{test_name}_{stamp}.json')

    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(msg, handle, indent=4)
82
+
83
+
84
def str_to_bool(s):
    """Convert a textual flag to ``bool`` (case-insensitive).

    ``true``/``1``/``yes``/``y`` map to ``True`` and ``false``/``0``/``no``/``n``
    map to ``False``.

    NOTE(review): a second ``str_to_bool`` is defined further down in this
    module and replaces this one once the module has been imported.

    Raises:
        ValueError: if ``s`` is none of the recognised spellings.
    """
    normalized = s.lower()
    if normalized in ('true', '1', 'yes', 'y'):
        return True
    if normalized in ('false', '0', 'no', 'n'):
        return False
    raise ValueError(f"Cannot convert {s} to boolean")
91
+
92
+
93
+
94
def execute_list_function(path, function_name, arguments=None):
    """Load the Python file at ``path`` and call ``function_name`` from it.

    Args:
        path: file system path of the Python source file to load.
        function_name: name of the callable to look up in the loaded module.
        arguments: optional call arguments. A single non-list value is passed
            as one positional argument. In a list, ``dict`` items are merged
            into keyword arguments and every other item is passed
            positionally. ``None`` and empty strings/containers mean "call
            without arguments"; falsy numbers and booleans (``0``, ``False``)
            are real arguments and are passed through.

    Returns:
        Whatever the called function returns.

    Raises:
        InvalidFormat: if the call raises ``TypeError``. Note that a
            ``TypeError`` raised inside the called function is reported the
            same way as a signature mismatch; the original error is chained
            as ``__cause__``.
    """
    spec = importlib.util.spec_from_file_location("my_module", path)
    my_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(my_module)

    function_to_execute = getattr(my_module, function_name)

    # A plain truthiness test would silently drop arguments such as 0 or
    # False, so numbers/booleans are always treated as real arguments.
    no_arguments = not arguments and not isinstance(arguments, (int, float))

    if no_arguments:
        try:
            return function_to_execute()
        except TypeError as e:
            raise InvalidFormat(f"Arguments are needed for this function: {e}") from e

    items = arguments if isinstance(arguments, list) else [arguments]

    args = [item for item in items if not isinstance(item, dict)]
    kwargs = {}
    for item in items:
        if isinstance(item, dict):
            kwargs.update(item)

    try:
        return function_to_execute(*args, **kwargs)
    except TypeError as e:
        raise InvalidFormat(f"No arguments needed for this function: {e}") from e
122
+
123
+
124
def list_to_phrase(s_list: list, prompted=False):  # todo: rename to list_to_askabout
    """Render a list of values as an "a, b or c" phrase.

    Args:
        s_list: values to enumerate; each one is converted with ``str``.
        prompted: when true and the list has at least two items, the phrase is
            prefixed with ``"please, ask about "``. A single-item list is
            returned as-is, without the prefix.

    Returns:
        ``""`` for an empty list, the single item for a one-element list,
        otherwise the items joined by ``", "`` with ``" or "`` before the
        last one.
    """
    items = [str(item) for item in s_list]

    if not items:
        return ""
    if len(items) == 1:
        return items[0]

    # The separator is chosen by position, so a value that merely equals the
    # last item does not get an "or" in the middle of the phrase.
    l_string = f"{', '.join(items[:-1])} or {items[-1]}"

    if prompted:
        l_string = "please, ask about " + l_string

    return l_string
142
+
143
+
144
def read_yaml(file):
    """Parse a ``.yaml``/``.yml`` file with ``yaml.safe_load``.

    Args:
        file: path of the YAML file; the extension check is case-insensitive.

    Returns:
        The parsed document.

    Raises:
        InvalidFile: if the path does not end in ``.yaml`` or ``.yml``.
        yaml.YAMLError: propagated unchanged when parsing fails.
    """
    is_yaml = file.lower().endswith(('.yaml', '.yml'))
    if not is_yaml:
        raise InvalidFile("File type is not a YAML.")

    with open(file, 'r', encoding="UTF-8") as stream:
        return yaml.safe_load(stream)
154
+
155
+
156
+
157
+
158
def generate_serial():
    """Return the current local time formatted as ``YYYY-MM-DD-HH-MM-SS``."""
    return datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
163
+
164
+
165
class MyDumper(yaml.Dumper):
    """YAML dumper whose ``write_line_break`` calls the base implementation twice."""

    def write_line_break(self, data=None):
        # Two delegated calls per requested line break.
        for _ in range(2):
            super().write_line_break(data)
169
+
170
+
171
def save_test_conv(history, metadata, test_name, path, serial, conversation_time, response_time, av_data, counter):
    """Save one conversation as a multi-document YAML file, plus an optional CSV.

    The output folder is ``<path>/conversation_outputs/<test_name>/<serial>``
    and is created when missing. The YAML file holds three documents in this
    order: ``metadata``, a timing dict (conversation time, assistant response
    times and their ``get_time_stats`` report) and ``history``.

    Args:
        history: conversation content, written as the third YAML document.
        metadata: written as the first YAML document.
        test_name: used in the folder and file names.
        path: root output folder (string).
        serial: run identifier used in the folder and file names.
        conversation_time: stored under ``'conversation time'``.
        response_time: stored under ``'assistant response time'`` and
            summarised with ``get_time_stats``.
        av_data: indexable pair; when ``av_data[1]`` is truthy, ``av_data[0]``
            is exported with ``to_csv`` (columns ``verification`` and ``data``).
        counter: prefix of the output file names.

    Side effects:
        Prints progress messages and clears ``config.errors``.
    """
    print("Saving conversation...")

    timing = {
        'conversation time': conversation_time,
        'assistant response time': response_time,
        "response time report": get_time_stats(response_time),
    }

    # One makedirs call creates both the test folder and the serial folder.
    output_dir = path + f"/conversation_outputs/{test_name}" + f"/{serial}"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    base_name = f'{counter}_{test_name}_{serial}'
    yaml_target = os.path.join(output_dir, base_name + '.yml')
    csv_target = os.path.join(output_dir, base_name + '.csv')

    documents = [metadata, timing, history]
    with open(yaml_target, "w", encoding="UTF-8") as out:
        yaml.dump_all(documents, out, allow_unicode=True, default_flow_style=False, sort_keys=False)

    if av_data[1]:
        av_data[0].to_csv(csv_target, index=True, sep=';', header=True, columns=['verification', 'data'])

    print(f"Conversation saved in {path}")
    print('------------------------------')
    config.errors.clear()
199
+
200
+
201
def get_error_stats(error_df):
    """Summarise an error table per distinct error code.

    Args:
        error_df: DataFrame with ``'error_code'`` and ``'conversation'`` columns.

    Returns:
        list[dict]: one entry per unique error code, in order of first
        appearance, with keys ``'error'``, ``'count'`` (number of rows) and
        ``'conversations'`` (list of the matching conversation values).
    """
    codes = error_df['error_code']

    report = []
    for code in codes.unique():
        matching = error_df[codes == code]
        report.append({
            'error': code,
            'count': matching.shape[0],
            'conversations': list(matching['conversation']),
        })
    return report
212
+
213
+
214
def get_time_stats(response_time):
    """Compute average, maximum and minimum of a series of durations.

    Args:
        response_time: durations in seconds, in any form accepted by
            ``pd.to_timedelta(..., unit='s')``.

    Returns:
        dict with keys ``'average'``, ``'max'`` and ``'min'``, each in
        seconds rounded to 6 decimals.
    """
    durations = pd.to_timedelta(response_time, unit='s')

    aggregates = (
        ('average', durations.mean()),
        ('max', durations.max()),
        ('min', durations.min()),
    )
    return {label: round(value.total_seconds(), 6) for label, value in aggregates}
223
+
224
class ExecutionStats:
    """Collect, print and export response-time, error and cost statistics.

    Statistics are read back from the conversation YAML files stored under
    ``<path>/conversation_outputs/<test name>/<serial>`` and from the cost CSV
    located at ``config.cost_ds_path``.
    """

    def __init__(self, test_cases_folder, serial):
        self.path = test_cases_folder      # root folder of conversation_outputs/ and reports/
        self.test_names = []               # names registered through add_test_name()
        self.serial = serial               # run identifier used in folder and report names
        self.export = False
        self.profile_art = []              # one list of assistant response times per get_stats() call
        self.profile_edf = []              # one error DataFrame per get_stats() call
        self.global_time_stats = []        # flattened profile_art
        self.global_error_stats = None     # concatenated profile_edf

    def add_test_name(self, test_name):
        """Register one test name (``str``) or several (``list``); other types are ignored."""
        if isinstance(test_name, str):
            self.test_names.append(test_name)
        elif isinstance(test_name, list):
            self.test_names += test_name

    def reset(self):
        """Clear the registered test names and the export flag (collected profiles are kept)."""
        self.test_names = []
        self.export = False

    def get_stats(self):
        """Read the YAML outputs of the last registered test and store its profile.

        From every ``.yaml``/``.yml`` file in the test folder, the second
        document contributes its ``'assistant response time'`` list and the
        first document contributes its ``'errors'`` (only when it also has a
        ``'serial'`` key). Files that fail to parse are reported and skipped.

        Raises:
            IndexError: if no test name has been registered yet.
        """
        if not self.test_names:
            raise IndexError("No test names registered; call add_test_name() before get_stats().")

        path_folder = self.path + f"/conversation_outputs/{self.test_names[-1]}" + f"/{self.serial}"

        assistant_response_times = []
        error_rows = []

        for file_name in os.listdir(path_folder):
            if not file_name.endswith(('.yaml', '.yml')):
                continue

            file_path = os.path.join(path_folder, file_name)
            with open(file_path, 'r', encoding='utf-8') as yaml_file:
                try:
                    yaml_content = list(yaml.safe_load_all(yaml_file))
                except yaml.YAMLError as e:
                    print(f'error while processing the file {file_path}: {e}')
                    continue

            # Files with fewer documents than expected contribute nothing
            # instead of aborting the whole scan.
            metadata = yaml_content[0] if len(yaml_content) > 0 else None
            timing = yaml_content[1] if len(yaml_content) > 1 else None

            if isinstance(timing, dict) and "assistant response time" in timing:
                assistant_response_times += timing['assistant response time'] or []

            if isinstance(metadata, dict) and "errors" in metadata and 'serial' in metadata:
                for error in metadata['errors'] or []:
                    # One row per error code, so multi-key error dicts are fine.
                    for code in error.keys():
                        error_rows.append({'conversation': file_name, 'error_code': code})

        # Built once (no concat inside the loop); object dtype keeps the error
        # codes as the plain Python values that were read from the YAML files.
        error_df = pd.DataFrame(error_rows, columns=["conversation", "error_code"], dtype=object)

        self.profile_art.append(assistant_response_times)
        self.profile_edf.append(error_df)

    @staticmethod
    def _load_cost_dataset():
        """Read the cost CSV at ``config.cost_ds_path`` using its detected encoding."""
        return pd.read_csv(config.cost_ds_path, encoding=get_encoding(config.cost_ds_path)["encoding"])

    @staticmethod
    def _print_report(time_stats, error_stats, total_cost):
        """Print the time, error and cost summary shared by the ``show_*`` methods."""
        print(f"Average assistant response time: {time_stats['average']} (s)")
        print(f"Maximum assistant response time: {time_stats['max']} (s)")
        print(f"Minimum assistant response time: {time_stats['min']} (s)")

        for error in error_stats:
            print(f"Found error {error['error']}: \n "
                  f"- Count: {error['count']} \n "
                  f"- Conversations: {error['conversations']}")

        print(f"Total Cost: ${total_cost}")

        print('------------------------------\n'
              '------------------------------')

    def _update_global_stats(self):
        """Recompute the aggregated time list and error table from all profiles."""
        self.global_time_stats = [time for profile in self.profile_art for time in profile]
        if self.profile_edf:
            self.global_error_stats = pd.concat(self.profile_edf, ignore_index=True)
        else:
            # pd.concat rejects an empty list; fall back to an empty table.
            self.global_error_stats = pd.DataFrame(columns=["conversation", "error_code"])

    def show_last_stats(self):
        """Collect the stats of the last registered test and print them.

        The printed cost is the sum of ``"Total Cost"`` over the CSV rows whose
        ``"Test Name"`` equals ``config.test_name``.
        """
        cost_ds = self._load_cost_dataset()
        self.get_stats()

        time_stats = get_time_stats(self.profile_art[-1])
        error_stats = get_error_stats(self.profile_edf[-1])
        total_cost = round(float(cost_ds[cost_ds["Test Name"] == config.test_name]["Total Cost"].sum()), 8)

        self._print_report(time_stats, error_stats, total_cost)

    def show_global_stats(self):
        """Aggregate every collected profile and print the overall report."""
        cost_ds = self._load_cost_dataset()
        self._update_global_stats()

        time_stats = get_time_stats(self.global_time_stats)
        error_stats = get_error_stats(self.global_error_stats)
        total_cost = round(float(cost_ds["Total Cost"].sum()), 8)

        self._print_report(time_stats, error_stats, total_cost)

    def export_stats(self):
        """Write the global and per-test reports to a YAML file.

        The file is ``<path>/reports/__stats_reports__/report_<serial>.yml``
        and contains the global report followed by one document per
        registered test name. The global aggregates are recomputed here, so
        this method does not depend on ``show_global_stats`` having run.
        """
        export_path = self.path + "/reports/__stats_reports__"
        cost_ds = self._load_cost_dataset()

        if not os.path.exists(export_path):
            os.makedirs(export_path)

        single_reports = []
        for index, name in enumerate(self.test_names):
            time_stats = get_time_stats(self.profile_art[index])
            error_stats = get_error_stats(self.profile_edf[index])
            total_cost = round(float(cost_ds[cost_ds["Test Name"] == name]["Total Cost"].sum()), 8)

            single_reports.append({
                "Test name": name,
                "Average assistant response time": time_stats['average'],
                "Maximum assistant response time": time_stats['max'],
                "Minimum assistant response time": time_stats['min'],
                "Errors": error_stats,
                "Total Cost": total_cost
            })

        self._update_global_stats()
        glb_time_stats = get_time_stats(self.global_time_stats)
        glb_error_stats = get_error_stats(self.global_error_stats)
        glb_total_cost = round(float(cost_ds["Total Cost"].sum()), 8)

        global_reports = {
            "Global report": {
                "Average assistant response time": glb_time_stats['average'],
                "Maximum assistant response time": glb_time_stats['max'],
                "Minimum assistant response time": glb_time_stats['min'],
                "Errors": glb_error_stats,
                "Total Cost": glb_total_cost
            }
        }

        export_file_name = export_path + f"/report_{self.serial}.yml"
        data = [global_reports] + single_reports

        with open(export_file_name, "w", encoding="UTF-8") as archivo:
            yaml.dump_all(data, archivo, allow_unicode=True, default_flow_style=False, sort_keys=False)
        logger.info(f"report file saved at {export_file_name}")
362
+
363
+
364
def response_processor(response):
    """Placeholder: accepts a response, does nothing with it and returns ``None``."""
    return None
366
+
367
def normalize_regex_pattern(pattern: str) -> str:
    """Strip a literal ``r"..."`` wrapper from a pattern string.

    A pattern that starts with ``r"`` and ends with ``"`` is returned without
    those delimiters; any other pattern is returned unchanged.
    """
    is_wrapped = pattern.startswith('r"') and pattern.endswith('"')
    return pattern[2:-1] if is_wrapped else pattern
372
+
373
def preprocess_text(text):
    """Lower-case ``text`` and drop every character that is neither a word character nor whitespace."""
    lowered = text.lower()
    # Remove punctuation marks
    return re.sub(r'[^\w\s]', '', lowered)
379
+
380
+
381
def str_to_bool(s):
    """Convert a textual flag to ``bool`` (case-insensitive).

    This definition comes after an earlier ``str_to_bool`` in the same module
    and therefore is the one callers get. It accepts the same spellings as
    that earlier definition: ``true``/``1``/``yes``/``y`` map to ``True`` and
    ``false``/``0``/``no``/``n`` map to ``False``.

    Raises:
        KeyError: if ``s`` is none of the recognised spellings. ``KeyError``
            (not ``ValueError``) is kept because it is what this definition
            has always raised.
    """
    normalized = s.lower()
    if normalized in ('true', '1', 'yes', 'y'):
        return True
    if normalized in ('false', '0', 'no', 'n'):
        return False
    raise KeyError(normalized)
383
+
384
+
385
def nlp_processor(msg, patterns=None, threshold=0.5):
    """Tell whether ``msg`` is similar enough to any reference pattern.

    Both the message and the patterns are normalised with ``preprocess_text``,
    vectorised with a ``TfidfVectorizer`` fitted on the patterns, and compared
    with cosine similarity.

    Args:
        msg: message to test.
        patterns: one pattern string or an iterable of pattern strings.
            ``None`` or an empty iterable means there is nothing to match.
        threshold: minimum similarity for a match.

    Returns:
        ``False`` when no patterns are given; otherwise whether the highest
        similarity between the message and a pattern is ``>= threshold``.
    """
    if patterns is None:
        return False

    # A single string is one pattern; any other iterable is a list of them.
    read_patterns = [patterns] if isinstance(patterns, str) else list(patterns)
    if not read_patterns:
        return False

    prepro_patterns = [preprocess_text(pattern) for pattern in read_patterns]
    vectorizer = TfidfVectorizer().fit(prepro_patterns)

    processed_msg = preprocess_text(msg)

    # Vectorise the message together with the patterns
    vectors = vectorizer.transform([processed_msg] + prepro_patterns)
    vector_msg = vectors[0]
    patt_msg = vectors[1:]

    # Cosine similarity between the message and every pattern
    similarities = cosine_similarity(vector_msg, patt_msg)
    max_sim = similarities.max()

    return max_sim >= threshold
406
+
407
+
408
def build_sequence(pairs):
    """Chain ``(current, next)`` pairs into ordered sequences.

    Args:
        pairs: iterable of ``(a, b)`` tuples meaning "``a`` is followed by
            ``b``"; ``b`` may be ``None`` to mark the end of a chain. When
            the same ``a`` appears more than once, the last pair wins.

    Returns:
        list[list]: one sequence per starting element, i.e. per element that
        appears as a first item but never as a second item.

    Raises:
        ValueError: if no starting element exists, or if following a chain
            runs into a cycle (which would otherwise never terminate).
    """
    mapping = {}
    starts = set()
    ends = set()
    for a, b in pairs:
        mapping[a] = b
        starts.add(a)
        if b is not None:
            ends.add(b)

    # Find starting words (appear in 'starts' but not in 'ends')
    start_words = starts - ends
    start_words.discard(None)

    sequences = []
    for start_word in start_words:
        sequence = [start_word]
        visited = {start_word}
        current_word = start_word
        while mapping.get(current_word) is not None:
            current_word = mapping[current_word]
            if current_word in visited:
                raise ValueError(f"Cycle detected in sequence starting at {start_word!r}.")
            visited.add(current_word)
            sequence.append(current_word)
        sequences.append(sequence)

    if not sequences:
        raise ValueError("Cannot determine a unique starting point.")
    return sequences
432
+
433
+
434
def get_random_date():
    """Return a random calendar date as an unpadded ``day/month/year`` string.

    The year is drawn from 0-3000 and the month from 1-12; the day is drawn
    from 1 up to the length of that month, using the full Gregorian leap-year
    rule (so e.g. February 1900 has 28 days).
    """
    import calendar  # local import keeps the module-level import block untouched

    year = random.randint(0, 3000)
    month = random.randint(1, 12)

    if month == 2:
        last_day = 29 if calendar.isleap(year) else 28
    elif month in (4, 6, 9, 11):
        last_day = 30
    else:
        last_day = 31

    day = random.randint(1, last_day)
    return f"{day}/{month}/{year}"
449
+
450
+
451
def get_date_range(start, end, step, date_type):
    """Generate ``dd/mm/YYYY`` date strings between ``start`` and ``end``.

    Args:
        start: first ``datetime`` of the range.
        end: last ``datetime`` of the range.
        step: meaning depends on ``date_type`` (see below).
        date_type: one of
            * contains ``'linspace'``: ``step`` evenly spaced dates from
              ``start`` to ``end`` (both included when ``step`` > 1);
            * ``'day'``, ``'month'``, ``'year'``: dates from ``start`` onwards
              every ``step`` days / 30-day months / 365-day years, never
              going past ``end``;
            * contains ``'random'``: ``step`` random dates in the range.

    Returns:
        list[str]: the generated dates.

    Raises:
        InvalidFormat: if ``date_type`` is not recognised, or if a
            day/month/year step is not positive (it could never reach ``end``).
    """
    if 'linspace' in date_type:
        total_seconds = (end - start).total_seconds()
        interval_seconds = total_seconds / (step - 1) if step > 1 else 0
        return [
            (start + timedelta(seconds=interval_seconds * i)).strftime('%d/%m/%Y')
            for i in range(step)
        ]

    if date_type in ('day', 'month', 'year'):
        # Months and years are approximated as 30 and 365 days.
        days_per_unit = {'day': 1, 'month': 30, 'year': 365}[date_type]
        step_days = days_per_unit * step
        if step_days <= 0:
            raise InvalidFormat(f"Date range step must be a positive number: {step}")

        increment = timedelta(days=step_days)
        range_date_list = [start.strftime('%d/%m/%Y')]
        current = start + increment
        # Stop before overshooting: every generated date stays within the range.
        while current <= end:
            range_date_list.append(current.strftime('%d/%m/%Y'))
            current += increment
        return range_date_list

    if 'random' in date_type:
        delta = end - start
        return [
            (start + timedelta(days=random.randint(0, delta.days))).strftime('%d/%m/%Y')
            for _ in range(step)
        ]

    raise InvalidFormat(f"The following parameter does not belong to date range field: {date_type}")
479
+
480
def get_fake_date():
    """Return a deliberately out-of-range ``day/month/year`` string.

    The day is drawn from 29-99, the month from 13-99 (so it is never a real
    month) and the year from 2000-2099.
    """
    # Values are drawn in day, month, year order.
    bounds = ((29, 99), (13, 99), (2000, 2099))
    fake_day, fake_month, fake_year = (random.randint(low, high) for low, high in bounds)
    return f"{fake_day}/{fake_month}/{fake_year}"
487
+
488
+
489
def get_date_list(date):
    """Build a list of date strings from a date specification dict.

    Recognised keys of ``date`` (all optional, processed in this order):
        * ``'random'``: number of dates to take from ``get_random_date``.
        * ``'set'``: ``'>today(N)'`` or ``'<today(N)'`` - ``N`` random dates
          1-365 days after / before now.
        * ``'range'``: dict with ``'min'`` and ``'max'`` (``dd/mm/YYYY``) and
          either ``'step'`` (``linspace(N)``, ``day(N)``, ``month(N)`` or
          ``year(N)``) or ``'random'`` (number of dates); delegated to
          ``get_date_range``.
        * ``'fake'``: number of distinct dates to take from ``get_fake_date``.
        * ``'custom'``: one date or a list of dates, appended last as given.

    Returns:
        list: generated dates followed by the custom dates.

    Raises:
        InvalidFormat: if a range ``'step'`` names no known step type.
    """
    custom_dates = []
    generated_dates = []

    if 'custom' in date:
        custom = date['custom']
        custom_dates = custom if isinstance(custom, list) else [custom]

    if 'random' in date:
        generated_dates += [get_random_date() for _ in range(date['random'])]

    if 'set' in date:
        rule = date['set']
        value = int(re.findall(r'today\((.*?)\)', rule)[0])
        today = datetime.now()

        if '>today' in rule:
            generated_dates += [
                (today + timedelta(days=random.randint(1, 365))).strftime('%d/%m/%Y') for _ in range(value)
            ]
        elif '<today' in rule:
            generated_dates += [
                (today - timedelta(days=random.randint(1, 365))).strftime('%d/%m/%Y') for _ in range(value)
            ]

    if 'range' in date:
        date_range = date['range']
        start = datetime.strptime(date_range['min'], '%d/%m/%Y')
        end = datetime.strptime(date_range['max'], '%d/%m/%Y')

        if 'step' in date_range:
            step_spec = date_range['step']
            step_value = int(re.findall(r'\((.*?)\)', step_spec)[0])

            # First matching step type wins, in this order.
            for step_type in ('linspace', 'day', 'month', 'year'):
                if step_type in step_spec:
                    generated_dates += get_date_range(start, end, step_value, step_type)
                    break
            else:
                raise InvalidFormat(f"The following parameter does not belong "
                                    f"to date range field: {step_spec}")

        elif 'random' in date_range:
            generated_dates += get_date_range(start, end, date_range['random'], 'random')

    if 'fake' in date:
        num_dates = date["fake"]

        fake_date_list = []
        while len(fake_date_list) < num_dates:
            fake_date = get_fake_date()
            # Append the very date that passed the uniqueness check.
            if fake_date not in fake_date_list:
                fake_date_list.append(fake_date)

        generated_dates += fake_date_list

    final_date_list = generated_dates + custom_dates
    return final_date_list
566
+
567
+
568
+